3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_DRM_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _DRM_TRACE_H_ #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM drm #define TRACE_INCLUDE_FILE drm_trace TRACE_EVENT(drm_vblank_event, TP_PROTO(int crtc, unsigned int seq), TP_ARGS(crtc, seq), TP_STRUCT__entry( __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("crtc=%d, seq=%u", __entry->crtc, __entry->seq) ); TRACE_EVENT(drm_vblank_event_queued, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); TRACE_EVENT(drm_vblank_event_delivered, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); #endif /* _DRM_TRACE_H_ */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../drivers/gpu/drm #include <trace/define_trace.h> |
16 16 16 15 16 15 16 15 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2013. All rights reserved. */ #include <linux/uuid.h> #include <asm/unaligned.h> #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { key->type = type; key->objectid = get_unaligned_le64(uuid); key->offset = get_unaligned_le64(uuid + sizeof(u64)); } /* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, u8 type, u64 subid) { int ret; struct btrfs_path *path = NULL; struct extent_buffer *eb; int slot; u32 item_size; unsigned long offset; struct btrfs_key key; if (WARN_ON_ONCE(!uuid_root)) { ret = -ENOENT; goto out; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } btrfs_uuid_to_key(uuid, type, &key); ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { ret = -ENOENT; goto out; } eb = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size_nr(eb, slot); offset = btrfs_item_ptr_offset(eb, slot); ret = -ENOENT; if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto out; } while (item_size) { __le64 data; read_extent_buffer(eb, &data, offset, sizeof(data)); if (le64_to_cpu(data) == subid) { ret = 0; break; } offset += sizeof(data); item_size -= sizeof(data); } out: btrfs_free_path(path); return ret; } int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid_cpu) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; struct btrfs_key key; struct extent_buffer *eb; int slot; unsigned long offset; __le64 subid_le; ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); if (ret != -ENOENT) return ret; if (WARN_ON_ONCE(!uuid_root)) { ret = -EINVAL; goto out; } btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); if (ret >= 0) { /* Add an item for the type for the first time */ eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); } else if (ret == -EEXIST) { /* * An item with that type already exists. * Extend the item and store the new subid at the end. */ btrfs_extend_item(fs_info, path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); goto out; } ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); btrfs_mark_buffer_dirty(eb); out: btrfs_free_path(path); return ret; } int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; struct btrfs_key key; struct extent_buffer *eb; int slot; unsigned long offset; u32 item_size; unsigned long move_dst; unsigned long move_src; unsigned long move_len; if (WARN_ON_ONCE(!uuid_root)) { ret = -EINVAL; goto out; } btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for uuid item!", ret); goto out; } if (ret > 0) { ret = -ENOENT; goto out; } eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; } while (item_size) { __le64 read_subid; read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); if (le64_to_cpu(read_subid) == subid) break; offset += sizeof(read_subid); item_size -= sizeof(read_subid); } if (!item_size) { ret = -ENOENT; goto out; } item_size = btrfs_item_size_nr(eb, slot); if (item_size == sizeof(subid)) { ret = btrfs_del_item(trans, uuid_root, path); goto out; } move_dst = offset; move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); return ret; } static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, u64 subid) { struct btrfs_trans_handle *trans; int ret; /* 1 - for the uuid item */ trans = btrfs_start_transaction(uuid_root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); btrfs_end_transaction(trans); out: return ret; } int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, int (*check_func)(struct btrfs_fs_info *, u8 *, u8, u64)) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; struct btrfs_path *path; int ret = 0; struct extent_buffer *leaf; int slot; u32 item_size; unsigned long offset; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } key.objectid = 0; key.type = 0; key.offset = 0; again_search_slot: ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; goto out; } while (1) { cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type != BTRFS_UUID_KEY_SUBVOL && key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) goto skip; offset = btrfs_item_ptr_offset(leaf, slot); item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); goto skip; } while (item_size) { u8 uuid[BTRFS_UUID_SIZE]; __le64 subid_le; u64 subid_cpu; put_unaligned_le64(key.objectid, uuid); put_unaligned_le64(key.offset, uuid + sizeof(u64)); read_extent_buffer(leaf, &subid_le, offset, sizeof(subid_le)); subid_cpu = le64_to_cpu(subid_le); ret = check_func(fs_info, uuid, key.type, subid_cpu); if (ret < 0) goto out; if (ret > 0) { btrfs_release_path(path); ret = btrfs_uuid_iter_rem(root, uuid, key.type, subid_cpu); if (ret == 0) { /* * this might look inefficient, but the * justification is that it is an * exception that check_func returns 1, * and that in the regular case only one * entry per UUID exists. */ goto again_search_slot; } if (ret < 0 && ret != -ENOENT) goto out; key.offset++; goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); } skip: ret = btrfs_next_item(root, path); if (ret == 0) continue; else if (ret > 0) ret = 0; break; } out: btrfs_free_path(path); return ret; } |
36 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | /* * Media device node * * Copyright (C) 2010 Nokia Corporation * * Contacts: Laurent Pinchart <laurent.pinchart@ideasonboard.com> * Sakari Ailus <sakari.ailus@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * -- * * Common functions for media-related drivers to register and unregister media * device nodes. */ #ifndef _MEDIA_DEVNODE_H #define _MEDIA_DEVNODE_H #include <linux/poll.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/cdev.h> struct media_device; /* * Flag to mark the media_devnode struct as registered. Drivers must not touch * this flag directly, it will be set and cleared by media_devnode_register and * media_devnode_unregister. */ #define MEDIA_FLAG_REGISTERED 0 /** * struct media_file_operations - Media device file operations * * @owner: should be filled with %THIS_MODULE * @read: pointer to the function that implements read() syscall * @write: pointer to the function that implements write() syscall * @poll: pointer to the function that implements poll() syscall * @ioctl: pointer to the function that implements ioctl() syscall * @compat_ioctl: pointer to the function that will handle 32 bits userspace * calls to the the ioctl() syscall on a Kernel compiled with 64 bits. * @open: pointer to the function that implements open() syscall * @release: pointer to the function that will release the resources allocated * by the @open function. */ struct media_file_operations { struct module *owner; ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*open) (struct file *); int (*release) (struct file *); }; /** * struct media_devnode - Media device node * @media_dev: pointer to struct &media_device * @fops: pointer to struct &media_file_operations with media device ops * @dev: pointer to struct &device containing the media controller device * @cdev: struct cdev pointer character device * @parent: parent device * @minor: device node minor number * @flags: flags, combination of the ``MEDIA_FLAG_*`` constants * @release: release callback called at the end of ``media_devnode_release()`` * routine at media-device.c. * * This structure represents a media-related device node. * * The @parent is a physical device. It must be set by core or device drivers * before registering the node. */ struct media_devnode { struct media_device *media_dev; /* device ops */ const struct media_file_operations *fops; /* sysfs */ struct device dev; /* media device */ struct cdev cdev; /* character device */ struct device *parent; /* device parent */ /* device info */ int minor; unsigned long flags; /* Use bitops to access flags */ /* callbacks */ void (*release)(struct media_devnode *devnode); }; /* dev to media_devnode */ #define to_media_devnode(cd) container_of(cd, struct media_devnode, dev) /** * media_devnode_register - register a media device node * * @mdev: struct media_device we want to register a device node * @devnode: media device node structure we want to register * @owner: should be filled with %THIS_MODULE * * The registration code assigns minor numbers and registers the new device node * with the kernel. An error is returned if no free minor number can be found, * or if the registration of the device node fails. * * Zero is returned on success. * * Note that if the media_devnode_register call fails, the release() callback of * the media_devnode structure is *not* called, so the caller is responsible for * freeing any data. */ int __must_check media_devnode_register(struct media_device *mdev, struct media_devnode *devnode, struct module *owner); /** * media_devnode_unregister_prepare - clear the media device node register bit * @devnode: the device node to prepare for unregister * * This clears the passed device register bit. Future open calls will be met * with errors. Should be called before media_devnode_unregister() to avoid * races with unregister and device file open calls. * * This function can safely be called if the device node has never been * registered or has already been unregistered. */ void media_devnode_unregister_prepare(struct media_devnode *devnode); /** * media_devnode_unregister - unregister a media device node * @devnode: the device node to unregister * * This unregisters the passed device. Future open calls will be met with * errors. * * Should be called after media_devnode_unregister_prepare() */ void media_devnode_unregister(struct media_devnode *devnode); /** * media_devnode_data - returns a pointer to the &media_devnode * * @filp: pointer to struct &file */ static inline struct media_devnode *media_devnode_data(struct file *filp) { return filp->private_data; } /** * media_devnode_is_registered - returns true if &media_devnode is registered; * false otherwise. * * @devnode: pointer to struct &media_devnode. * * Note: If mdev is NULL, it also returns false. */ static inline int media_devnode_is_registered(struct media_devnode *devnode) { if (!devnode) return false; return test_bit(MEDIA_FLAG_REGISTERED, &devnode->flags); } #endif /* _MEDIA_DEVNODE_H */ |
135 135 135 135 135 135 135 135 135 135 135 135 135 134 135 135 135 135 135 135 135 135 135 135 135 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 | // SPDX-License-Identifier: GPL-2.0 /* * Multipath support for RPC * * Copyright (c) 2015, 2016, Primary Data, Inc. All rights reserved. * * Trond Myklebust <trond.myklebust@primarydata.com> * */ #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/slab.h> #include <asm/cmpxchg.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, const struct rpc_xprt *cur); static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular; static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall; static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { if (unlikely(xprt_get(xprt) == NULL)) return; list_add_tail_rcu(&xprt->xprt_switch, &xps->xps_xprt_list); smp_wmb(); if (xps->xps_nxprts == 0) xps->xps_net = xprt->xprt_net; xps->xps_nxprts++; } /** * rpc_xprt_switch_add_xprt - Add a new rpc_xprt to an rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * * Adds xprt to the end of the list of struct rpc_xprt in xps. */ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { if (xprt == NULL) return; spin_lock(&xps->xps_lock); if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { if (unlikely(xprt == NULL)) return; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; smp_wmb(); list_del_rcu(&xprt->xprt_switch); } /** * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * * Removes xprt from the list of struct rpc_xprt in xps. */ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { spin_lock(&xps->xps_lock); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); xprt_put(xprt); } /** * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * @gfp_flags: allocation flags * * On success, returns an initialised struct rpc_xprt_switch, containing * the entry xprt. Returns NULL on failure. */ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_xprt_switch *xps; xps = kmalloc(sizeof(*xps), gfp_flags); if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); xps->xps_nxprts = 0; INIT_LIST_HEAD(&xps->xps_xprt_list); xps->xps_iter_ops = &rpc_xprt_iter_singular; xprt_switch_add_xprt_locked(xps, xprt); } return xps; } static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) { spin_lock(&xps->xps_lock); while (!list_empty(&xps->xps_xprt_list)) { struct rpc_xprt *xprt; xprt = list_first_entry(&xps->xps_xprt_list, struct rpc_xprt, xprt_switch); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); xprt_put(xprt); spin_lock(&xps->xps_lock); } spin_unlock(&xps->xps_lock); } static void xprt_switch_free(struct kref *kref) { struct rpc_xprt_switch *xps = container_of(kref, struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); kfree_rcu(xps, xps_rcu); } /** * xprt_switch_get - Return a reference to a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Returns a reference to xps unless the refcount is already zero. */ struct rpc_xprt_switch *xprt_switch_get(struct rpc_xprt_switch *xps) { if (xps != NULL && kref_get_unless_zero(&xps->xps_kref)) return xps; return NULL; } /** * xprt_switch_put - Release a reference to a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Release the reference to xps, and free it once the refcount is zero. */ void xprt_switch_put(struct rpc_xprt_switch *xps) { if (xps != NULL) kref_put(&xps->xps_kref, xprt_switch_free); } /** * rpc_xprt_switch_set_roundrobin - Set a round-robin policy on rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Sets a round-robin default policy for iterators acting on xps. */ void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps) { if (READ_ONCE(xps->xps_iter_ops) != &rpc_xprt_iter_roundrobin) WRITE_ONCE(xps->xps_iter_ops, &rpc_xprt_iter_roundrobin); } static const struct rpc_xprt_iter_ops *xprt_iter_ops(const struct rpc_xprt_iter *xpi) { if (xpi->xpi_ops != NULL) return xpi->xpi_ops; return rcu_dereference(xpi->xpi_xpswitch)->xps_iter_ops; } static void xprt_iter_no_rewind(struct rpc_xprt_iter *xpi) { } static void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) { WRITE_ONCE(xpi->xpi_cursor, NULL); } static struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head) { return list_first_or_null_rcu(head, struct rpc_xprt, xprt_switch); } static struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); if (xps == NULL) return NULL; return xprt_switch_find_first_entry(&xps->xps_xprt_list); } static struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, const struct rpc_xprt *cur) { struct rpc_xprt *pos; list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == pos) return pos; } return NULL; } static struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); struct list_head *head; if (xps == NULL) return NULL; head = &xps->xps_xprt_list; if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2) return xprt_switch_find_first_entry(head); return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, const struct sockaddr *sap) { struct list_head *head; struct rpc_xprt *pos; if (xps == NULL || sap == NULL) return false; head = &xps->xps_xprt_list; list_for_each_entry_rcu(pos, head, xprt_switch) { if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { pr_info("RPC: addr %s already in xprt switch\n", pos->address_strings[RPC_DISPLAY_ADDR]); return true; } } return false; } static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) { struct rpc_xprt *pos, *prev = NULL; list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == prev) return pos; prev = pos; } return NULL; } static struct rpc_xprt *xprt_switch_set_next_cursor(struct list_head *head, struct rpc_xprt **cursor, xprt_switch_find_xprt_t find_next) { struct rpc_xprt *cur, *pos, *old; cur = READ_ONCE(*cursor); for (;;) { old = cur; pos = find_next(head, old); if (pos == NULL) break; cur = cmpxchg_relaxed(cursor, old, pos); if (cur == old) break; } return pos; } static struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, xprt_switch_find_xprt_t find_next) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); if (xps == NULL) return NULL; return xprt_switch_set_next_cursor(&xps->xps_xprt_list, &xpi->xpi_cursor, find_next); } static struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head, const struct rpc_xprt *cur) { struct rpc_xprt *ret; ret = xprt_switch_find_next_entry(head, cur); if (ret != NULL) return ret; return xprt_switch_find_first_entry(head); } static struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi) { return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry_roundrobin); } static struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) { return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry); } /* * xprt_iter_rewind - Resets the xprt iterator * @xpi: pointer to rpc_xprt_iter * * Resets xpi to ensure that it points to the first entry in the list * of transports. */ static void xprt_iter_rewind(struct rpc_xprt_iter *xpi) { rcu_read_lock(); xprt_iter_ops(xpi)->xpi_rewind(xpi); rcu_read_unlock(); } static void __xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps, const struct rpc_xprt_iter_ops *ops) { rcu_assign_pointer(xpi->xpi_xpswitch, xprt_switch_get(xps)); xpi->xpi_cursor = NULL; xpi->xpi_ops = ops; } /** * xprt_iter_init - Initialise an xprt iterator * @xpi: pointer to rpc_xprt_iter * @xps: pointer to rpc_xprt_switch * * Initialises the iterator to use the default iterator ops * as set in xps. This function is mainly intended for internal * use in the rpc_client. */ void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps) { __xprt_iter_init(xpi, xps, NULL); } /** * xprt_iter_init_listall - Initialise an xprt iterator * @xpi: pointer to rpc_xprt_iter * @xps: pointer to rpc_xprt_switch * * Initialises the iterator to iterate once through the entire list * of entries in xps. */ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps) { __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall); } /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter * @xps: pointer to a new rpc_xprt_switch or NULL * * Swaps out the existing xpi->xpi_xpswitch with a new value. */ struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *newswitch) { struct rpc_xprt_switch __rcu *oldswitch; /* Atomically swap out the old xpswitch */ oldswitch = xchg(&xpi->xpi_xpswitch, RCU_INITIALIZER(newswitch)); if (newswitch != NULL) xprt_iter_rewind(xpi); return rcu_dereference_protected(oldswitch, true); } /** * xprt_iter_destroy - Destroys the xprt iterator * @xpi pointer to rpc_xprt_iter */ void xprt_iter_destroy(struct rpc_xprt_iter *xpi) { xprt_switch_put(xprt_iter_xchg_switch(xpi, NULL)); } /** * xprt_iter_xprt - Returns the rpc_xprt pointed to by the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a pointer to the struct rpc_xprt that is currently * pointed to by the cursor. * Caller must be holding rcu_read_lock(). */ struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi) { WARN_ON_ONCE(!rcu_read_lock_held()); return xprt_iter_ops(xpi)->xpi_xprt(xpi); } static struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, struct rpc_xprt *(*fn)(struct rpc_xprt_iter *)) { struct rpc_xprt *ret; do { ret = fn(xpi); if (ret == NULL) break; ret = xprt_get(ret); } while (ret == NULL); return ret; } /** * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a reference to the struct rpc_xprt that is currently * pointed to by the cursor. */ struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); rcu_read_unlock(); return xprt; } /** * xprt_iter_get_next - Returns the next rpc_xprt following the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a reference to the struct rpc_xprt that immediately follows the * entry pointed to by the cursor. */ struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_next); rcu_read_unlock(); return xprt; } /* Policy for always returning the first entry in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular = { .xpi_rewind = xprt_iter_no_rewind, .xpi_xprt = xprt_iter_first_entry, .xpi_next = xprt_iter_first_entry, }; /* Policy for round-robin iteration of entries in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin = { .xpi_rewind = xprt_iter_default_rewind, .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_roundrobin, }; /* Policy for once-through iteration of entries in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = { .xpi_rewind = xprt_iter_default_rewind, .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_all, }; |
25 25 34 17 17 17 3264 3259 21 3237 3226 3438 17 964 962 22 22 22 17 17 17 17 1211 30 1291 2206 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 | #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. */ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); return 0; } static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) return 0; if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * @returns: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Currently always returns zero. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * __atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See the comment for notifier_call_chain. * @nr_calls: See the comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); NOKPROBE_SYMBOL(__atomic_notifier_call_chain); int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { return __atomic_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Currently always returns zero. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain, only if not already * present in the chain. * Must be called in process context. * * Currently always returns zero. */ int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; down_write(&nh->rwsem); ret = notifier_chain_cond_register(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); /** * __blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain. * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { return __blocking_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Currently always returns zero. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); /** * __raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); } EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return __raw_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); #ifdef CONFIG_SRCU /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Currently always returns zero. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: See comment for notifier_call_chain. * @nr_calls: See comment for notifier_call_chain * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { return __srcu_notifier_call_chain(nh, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); #endif /* CONFIG_SRCU */ static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); |
1054 2964 7 4558 491 338 20 338 338 1388 1349 1355 977 1353 1 1350 300 1346 999 999 998 999 1357 1349 656 595 655 133 26 26 678 655 678 133 26 25 5 119 26 26 1221 650 1220 1115 1115 830 1221 683 683 1221 1221 1219 199 197 1216 1310 1310 1309 79 78 79 666 1425 1425 1336 2201 828 827 2081 2202 7 7 7 694 695 695 695 695 695 695 695 695 695 722 722 649 649 649 721 722 722 722 722 722 723 723 723 21 601 600 722 722 722 722 784 784 708 87 87 73 708 77 157 157 695 694 785 780 1275 1275 324 5 318 1248 1271 1312 1312 1267 1303 1316 1317 1317 1304 1307 1326 1326 1326 327 1326 3 1325 85 23 1315 1243 1242 1243 18 19 19 19 93 93 89 111 111 111 19 19 19 19 18 19 19 19 19 16 16 16 15 16 15 16 16 16 16 16 11 11 16 16 16 16 16 16 16 4 4 4 3 2 2 2 1416 126 122 2 2 126 183 182 183 180 179 180 61 99 246 151 247 767 767 767 2 766 767 765 35 765 765 765 594 6 767 596 596 17 41 767 767 95 96 1 96 103 103 100 100 5 96 1 99 938 182 183 3 3 150 3 644 150 144 150 564 138 939 93 13 13 13 13 1082 1084 840 320 85 1049 5 5 2067 2067 494 2062 2065 337 8 1964 1963 1963 1963 1 1 1963 2059 2060 2 506 189 62 62 506 344 505 3603 3603 256 256 1243 1242 14 9 9 9 9 1246 14 1243 1241 1224 147 49 50 1176 1224 1246 344 50 296 83 344 344 340 342 1096 248 250 1097 1096 1049 216 1139 327 207 8 62 62 50 50 50 50 14 160 99 40 40 1 98 3 97 1311 4 4 1307 221 1311 1 1 1 1 1616 1339 303 327 4 1 391 19 3723 3195 3557 3555 1001 3197 2067 5 32 999 939 70 4537 4532 4522 92 4232 4533 4533 93 3775 1 328 1503 4 4560 4558 40 4534 4551 4549 815 815 1206 1207 12 12 12 12 12 12 7 8 8 8 8 8 7 8 46 46 46 46 5 3 1 43 1 44 44 44 46 46 17 22 22 22 33680 33773 1019 924 922 129 129 1017 365 364 906 906 136 136 135 2498 2501 2498 532 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 | /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgtable.h> #include "internal.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL * and ZONE_HIGHMEM. */ void *high_memory; EXPORT_SYMBOL(high_memory); /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_faults_on_old_pte static inline bool arch_faults_on_old_pte(void) { /* * Those arches which don't have hw access flag feature need to * implement their own helper. By default, "true" means pagefault * will be hit on old pte. */ return true; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { if (current->rss_stat.count[i]) { add_mm_counter(mm, i, current->rss_stat.count[i]); current->rss_stat.count[i] = 0; } } current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) { struct task_struct *task = current; if (likely(task->mm == mm)) task->rss_stat.count[member] += val; else add_mm_counter(mm, member, val); } #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) /* sync counter once per 64 page faults */ #define TASK_RSS_EVENTS_THRESH (64) static void check_sync_rss_stat(struct task_struct *task) { if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) sync_mm_rss(task->mm); } #else /* SPLIT_RSS_COUNTING */ #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) static void check_sync_rss_stat(struct task_struct *task) { } #endif /* SPLIT_RSS_COUNTING */ #ifdef HAVE_GENERIC_MMU_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; } void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; /* Is it from 0 to ~0? */ tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif tlb->page_size = 0; __tlb_reset_range(tlb); } static void tlb_flush_mmu_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; } tlb->active = &tlb->local; } void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); } /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. */ void arch_tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end, bool force) { struct mmu_gather_batch *batch, *next; if (force) __tlb_adjust_range(tlb, start, end - start); tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ check_pgt_cache(); for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb->local.next = NULL; } /* __tlb_remove_page * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while * handling the additional races in SMP caused by other CPUs caching valid * mappings in their TLBs. Returns the number of free page slots left. * When out of page slots we must call tlb_flush_mmu(). *returns true if the caller should flush. */ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; /* * Add the page and check if we are full. If so * force a flush. */ batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max, page); return false; } void tlb_flush_pmd_range(struct mmu_gather *tlb, unsigned long address, unsigned long size) { if (tlb->page_size != 0 && tlb->page_size != PMD_SIZE) tlb_flush_mmu(tlb); tlb->page_size = PMD_SIZE; tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + size); } #endif /* HAVE_GENERIC_MMU_GATHER */ #ifdef CONFIG_HAVE_RCU_TABLE_FREE /* * See the comment near struct mmu_table_batch. */ /* * If we want tlb_remove_table() to imply TLB invalidates. */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { #ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE /* * Invalidate page-table caches used by hardware walkers. Then we still * need to RCU-sched wait while freeing the pages because software * walkers can still be in-flight. */ tlb_flush_mmu_tlbonly(tlb); #endif } static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } void tlb_remove_table_sync_one(void) { smp_call_function(tlb_remove_table_smp_sync, NULL, 1); } static void tlb_remove_table_one(void *table) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. See the comment near struct mmu_table_batch. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); __tlb_remove_table(table); } static void tlb_remove_table_rcu(struct rcu_head *head) { struct mmu_table_batch *batch; int i; batch = container_of(head, struct mmu_table_batch, rcu); for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); } void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { tlb_table_invalidate(tlb); call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } } void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); } #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space * @start: start of the region that will be removed from the page-table * @end: end of the region that will be removed from the page-table * * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. The @start and @end are set to 0 and -1 * respectively when @mm is without users and we're going to destroy * the full address space (exit/execve). */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { arch_tlb_gather_mmu(tlb, mm, start, end); inc_tlb_flush_pending(tlb->mm); } void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB * flush by batching, a thread has stable TLB entry can fail to flush * the TLB by observing pte_none|!pte_dirty, for example so flush TLB * forcefully if we detect parallel PTE batching threads. */ bool force = mm_tlb_flush_nested(tlb->mm); arch_tlb_finish_mmu(tlb, start, end, force); dec_tlb_flush_pending(tlb->mm); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_read_barrier_depends() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } spin_unlock(ptl); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { pte_t *new = pte_alloc_one_kernel(&init_mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; if (current->mm == mm) sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->readpage : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The disadvantage is that pages are refcounted * (which can be slower and simply not an option for some PFNMAP users). The * advantage is that we don't have to follow the strict linearity rule of * PFNMAP mappings in order to support COWable mappings. * */ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; /* * Device public pages are special pages (they are ZONE_DEVICE * pages but different from persistent memory). They behave * allmost like normal pages. The difference is that they are * not on the lru and thus should never be involve with any- * thing that involve lru manipulation (mlock, numa balancing, * ...). * * This is why we still want to return NULL for such page from * vm_normal_page() so that we do not have to special case all * call site of vm_normal_page(). */ if (likely(pfn <= highest_memmap_pfn)) { struct page *page = pfn_to_page(pfn); if (is_device_public_page(page)) { if (with_public_device) return page; return NULL; } } if (pte_devmap(pte)) return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } #endif /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return entry.val; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both * parent and child to be set to read. */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = device_private_entry_to_page(entry); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ get_page(page); rss[mm_counter(page)]++; page_dup_rmap(page, false); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_write_device_private_entry(entry) && is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; } /* * If it's a COW mapping, write protect it both * in the parent and the child */ if (is_cow_mapping(vm_flags) && pte_write(pte)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } /* * If it's a shared mapping, mark it clean in * the child */ if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; } else if (pte_devmap(pte)) { page = pte_page(pte); /* * Cache coherent device memory behave like regular page and * not like persistent memory page. For more informations see * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h */ if (is_device_public_page(page)) { get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; } } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { progress++; continue; } entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); if (entry.val) break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (entry.val) { if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) return -ENOMEM; progress = 0; } if (addr != end) goto again; return 0; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, vma, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ bool is_cow; int ret; /* * Don't copy ptes where a page fault will fill them correctly. * Fork becomes much lighter when there are big shared or private * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && !vma->anon_vma) return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); if (unlikely(vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ ret = track_pfn_copy(vma); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(vma->vm_flags); mmun_start = addr; mmun_end = end; if (is_cow) mmu_notifier_invalidate_range_start(src_mm, mmun_start, mmun_end); ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, vma, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details) return true; /* Or, we zap COWed pages only if the caller wants to */ return !details->check_mapping; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { struct mm_struct *mm = tlb->mm; int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; swp_entry_t entry; tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) continue; if (pte_present(ptent)) { struct page *page; page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to * invalidate cache without truncating: * unmap shared but keep private pages. */ if (details->check_mapping && details->check_mapping != page_rmapping(page)) continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; addr += PAGE_SIZE; break; } continue; } entry = pte_to_swp_entry(ptent); if (non_swap_entry(entry) && is_device_private_entry(entry)) { struct page *page = device_private_entry_to_page(entry); if (unlikely(details && details->check_mapping)) { /* * unmap_shared_mapping_pages() wants to * invalidate cache without truncating: * unmap shared but keep private pages. */ if (details->check_mapping != page_rmapping(page)) continue; } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; page_remove_rmap(page, false); put_page(page); continue; } entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { /* Genuine swap entry, hence a private anon page */ if (!should_zap_cows(details)) continue; rss[MM_SWAPENTS]--; } else if (is_migration_entry(entry)) { struct page *page; page = migration_entry_to_page(entry); if (details && details->check_mapping && details->check_mapping != page_rmapping(page)) continue; rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Restart if we didn't do everything. */ if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); if (addr != end) goto again; } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ } else if (details && details->single_page && PageTransCompound(details->single_page) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is * none or trans huge it can change under us. This is * because MADV_DONTNEED holds the mmap_sem in read * mode. */ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); } while (pmd++, addr = next, addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL); i_mmap_unlock_write(vma->vm_file->f_mapping); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); } /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; unsigned long end = start + size; lru_add_drain(); tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; unsigned long end = address + size; lru_add_drain(); tlb_gather_mmu(&tlb, mm, address, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(mm, address, end); tlb_finish_mmu(&tlb, address, end); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (address < vma->vm_start || address + size > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); } /* * This is the old fallback for page remapping. * * For historical reasons, it only allows reserved pages. Only * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */ static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; retval = -EINVAL; if (PageAnon(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); retval = 0; pte_unmap_unlock(pte, ptl); return retval; out_unlock: pte_unmap_unlock(pte, ptl); out: return retval; } /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_sem write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte, entry; spinlock_t *ptl; retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = -EBUSY; if (!pte_none(*pte)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; } entry = pte_mkyoung(*pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); out: return retval; } /** * vm_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return NULL. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_pfn); /** * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vm_insert_pfn, except that it allows drivers to * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * cow mappings. In general, using multiple vmas is preferable; * vm_insert_pfn_prot should only be used if using multiple VMAs is * impractical. */ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!pfn_modify_allowed(pfn, pgprot)) return -EACCES; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (pfn_t_devmap(pfn)) return true; if (pfn_t_special(pfn)) return true; if (is_zero_pfn(pfn_t_to_pfn(pfn))) return true; return false; } static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return -EACCES; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } EXPORT_SYMBOL(vm_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { int err; err = __vm_insert_mixed(vma, addr, pfn, true); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target user address to start at * @pfn: physical address of kernel memory * @size: size of map area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; unsigned long remap_pfn = pfn; int err; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap(). * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) break; } while (pgd++, addr = next, addr != end); if (err) untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of area * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { unsigned long vm_len, pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start + len < start) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. */ len += start & ~PAGE_MASK; pfn = start >> PAGE_SHIFT; pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vma->vm_pgoff > pages) return -EINVAL; pfn += vma->vm_pgoff; pages -= vma->vm_pgoff; /* Can we fit all of the mapping? */ vm_len = vma->vm_end - vma->vm_start; if (vm_len >> PAGE_SHIFT > pages) return -EINVAL; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); pte = (mm == &init_mm) ? pte_alloc_kernel(pmd, addr) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; BUG_ON(pmd_huge(*pmd)); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); do { err = fn(pte++, token, addr, data); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pmd_t *pmd; unsigned long next; int err; BUG_ON(pud_huge(*pud)); pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); err = apply_to_pte_range(mm, pmd, addr, next, fn, data); if (err) break; } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { pud_t *pud; unsigned long next; int err; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = apply_to_pmd_range(mm, pud, addr, next, fn, data); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { p4d_t *p4d; unsigned long next; int err; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = apply_to_pud_range(mm, p4d, addr, next, fn, data); if (err) break; } while (p4d++, addr = next, addr != end); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. */ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { pgd_t *pgd; unsigned long next; unsigned long end = addr + size; int err; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); if (err) break; } while (pgd++, addr = next, addr != end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { spinlock_t *ptl = pte_lockptr(mm, pmd); spin_lock(ptl); same = pte_same(*page_table, orig_pte); spin_unlock(ptl); } #endif pte_unmap(page_table); return same; } static inline bool cow_user_page(struct page *dst, struct page *src, struct vm_fault *vmf) { bool ret; void *kaddr; void __user *uaddr; bool locked = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; debug_dma_assert_idle(src); if (likely(src)) { copy_user_highpage(dst, src, addr, vma); return true; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_atomic(dst); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* * Other thread has already handled the fault * and we don't need to do anything. If it's * not the case, the fault will be triggered * again on the same address. */ ret = false; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache(vma, addr, vmf->pte); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (locked) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The PTE changed under us. Retry page fault. */ ret = false; goto pte_unlock; } /* * The same page can be mapped back since last copy attampt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = true; pte_unlock: if (locked) pte_unmap_unlock(vmf->pte, vmf->ptl); kunmap_atomic(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { vm_fault_t ret; struct page *page = vmf->page; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { lock_page(page); if (!page->mapping) { unlock_page(page); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_PAGE(!PageLocked(page), page); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static void fault_dirty_shared_page(struct vm_area_struct *vma, struct page *page) { struct address_space *mapping; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = set_page_dirty(page); VM_BUG_ON_PAGE(PageAnon(page), page); /* * Take a local copy of the address_space - page.mapping may be zeroed * by truncate after unlock_page(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = page_rmapping(page); unlock_page(page); if ((dirtied || page_mkwrite) && mapping) { /* * Some device drivers do not set page.mapping * but still dirty their pages */ balance_dirty_pages_ratelimited(mapping); } if (!page_mkwrite) file_update_time(vma->vm_file); } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ static inline void wp_page_reuse(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); } /* * Handle the case of a page which we actually need to copy to a new page. * * Called with mmap_sem locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; if (!cow_user_page(new_page, old_page, vmf)) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. */ put_page(new_page); if (old_page) put_page(old_page); return 0; } } if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition * seen in the presence of one thread doing SMC and another * thread doing COW. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * page_remove_rmap with the ptp_clear_flush above. * Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in page_remove_rmap. * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ page_remove_rmap(old_page, false); } /* Free the old page.. */ new_page = old_page; page_copied = 1; } else { mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); /* * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ if (PageMlocked(old_page)) munlock_vma_page(old_page); unlock_page(old_page); } put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: put_page(new_page); oom: if (old_page) put_page(old_page); return VM_FAULT_OOM; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. The function returns * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE * lock. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(*vmf->pte, vmf->orig_pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); return VM_FAULT_WRITE; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(vmf->page); return tmp; } tmp = finish_mkwrite_fault(vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { unlock_page(vmf->page); put_page(vmf->page); return tmp; } } else { wp_page_reuse(vmf); lock_page(vmf->page); } fault_dirty_shared_page(vma, vmf->page); put_page(vmf->page); return VM_FAULT_WRITE; } /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary * COW. * * We also mark the page dirty at this point even though the page will * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); lock_page(vmf->page); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(vmf->page); return 0; } put_page(vmf->page); } if (reuse_swap_page(vmf->page, &total_map_swapcount)) { if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will * not search our parent or siblings. * Protected against the rmap code by * the page lock. */ page_move_anon_rmap(vmf->page, vma); } unlock_page(vmf->page); wp_page_reuse(vmf); return VM_FAULT_WRITE; } unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = details->first_index; if (zba < vba) zba = vba; zea = details->last_index; if (zea > vea) zea = vea; unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } /** * unmap_mapping_page() - Unmap single page from processes. * @page: The locked page to be unmapped. * * Unmap this page from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a page, it may find that * the page has been remapped again: and then uses unmap_mapping_page() * to unmap it finally. */ void unmap_mapping_page(struct page *page) { struct address_space *mapping = page->mapping; struct zap_details details = { }; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageTail(page)); details.check_mapping = mapping; details.first_index = page->index; details.last_index = page->index + hpage_nr_pages(page) - 1; details.single_page = page; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; details.check_mapping = even_cows ? NULL : mapping; details.first_index = start; details.last_index = start + nr - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; vm_fault_t ret = 0; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { /* * For un-addressable device memory we call the pgmap * fault handler callback. The callback must migrate * the page back to some CPU accessible page. */ ret = device_private_entry_fault(vma, vmf->address, entry, vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; if (!page) { struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && __swap_count(si, entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); lru_cache_add_anon(page); swap_readpage(page, true); } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = page; } if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not * release the swapcache from under us. The page pin, and pte_same * test below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not changed. */ if (unlikely((!PageSwapCache(page) || page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; goto out_page; } if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* * The page isn't present yet, go ahead with the fault. * * Be careful about the sequence of operations here. * To get its accounting right, reuse_swap_page() must be called * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. */ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the swap_free * so that the swap count won't change under a * parallel locked swapcache. */ unlock_page(swapcache); put_page(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: put_page(page); if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } return ret; } /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created * from a different thread. * * pte_alloc_map() is safe to use under down_write(mmap_sem) or when * parallel threads are excluded by other means. * * Here we only have down_read(mmap_sem). */ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* * The memory barrier inside __SetPageUptodate makes sure that * preceeding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto release; ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); goto unlock; oom_free_page: put_page(page); oom: return VM_FAULT_OOM; } /* * The mmap_sem must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_pne * shrink_page_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, vmf->address); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { struct page *page = vmf->page; vm_fault_t poisonret = VM_FAULT_HWPOISON; if (ret & VM_FAULT_LOCKED) { if (page_mapped(page)) unmap_mapping_pages(page_mapping(page), page->index, 1, false); /* Retry if a clean page was removed from the cache. */ if (invalidate_inode_page(page)) poisonret = VM_FAULT_NOPAGE; unlock_page(page); } put_page(page); vmf->page = NULL; return poisonret; } if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf->page); else VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); return ret; } /* * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. * If we check pmd_trans_unstable() first we will trip the bad_pmd() check * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. */ static int pmd_devmap_trans_unstable(pmd_t *pmd) { return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (!pmd_none(*vmf->pmd)) goto map_pte; if (vmf->prealloc_pte) { vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { spin_unlock(vmf->ptl); goto map_pte; } mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: /* * If a huge pmd materialized under us just retry later. Use * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge * under us and then back to pmd_none, as a result of MADV_DONTNEED * running immediately after a huge pmd fault in a different thread of * this mm, in turn leading to a misleading pmd_trans_huge() retval. * All we have to ensure is that it is a regular pmd that we can walk * with pte_offset_map() and we can do that through an atomic read in * C, which is what pmd_trans_unstable() provides. */ if (pmd_devmap_trans_unstable(vmf->pmd)) return VM_FAULT_NOPAGE; /* * At this point we know that our vmf->pmd points to a page of ptes * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() * for the duration of the fault. If a racing MADV_DONTNEED runs and * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still * be valid and we will re-check to make sure the vmf->pte isn't * pte_none() under vmf->ptl protection when we return to * alloc_set_pte(). */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); return 0; } #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE #define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) return false; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i; vm_fault_t ret; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; ret = VM_FAULT_FALLBACK; page = compound_head(page); /* * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) flush_icache_page(vma, page + i); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; } #endif /** * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (!vmf->pte) { ret = pte_alloc_one_map(vmf); if (ret) return ret; } /* Re-check under ptl */ if (unlikely(!pte_none(*vmf->pte))) return VM_FAULT_NOPAGE; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, vmf->address, vmf->pte); return 0; } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. The function returns 0 on success, VM_FAULT_ code in case of * error. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct page *page; vm_fault_t ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vmf->vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_bytes; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; if (val > PAGE_SIZE) fault_around_bytes = rounddown_pow_of_two(val); else fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); if (!ret) pr_warn("Failed to create fault_around_bytes in debugfs"); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function is called with the page table lock taken. In the split ptlock * case the page table lock only protects only those entries which belong to * the page table corresponding to the fault address. * * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * * fault_around_bytes defines how many bytes we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_bytes rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; vmf->address = max(address & mask, vmf->vma->vm_start); off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* * end_pgoff is either the end of the page table, the end of * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, vmf->address); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ if (!vmf->pte) goto out; /* check if the page fault is solved */ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); out: vmf->address = address; vmf->pte = NULL; return ret; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { ret = do_fault_around(vmf); if (ret) return ret; } ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(vmf->page); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) return VM_FAULT_OOM; if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); ret |= finish_fault(vmf); unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { unlock_page(vmf->page); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(vmf->page); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { unlock_page(vmf->page); put_page(vmf->page); return ret; } fault_dirty_shared_page(vma, vmf->page); return ret; } /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). * If mmap_sem is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { /* * If we find a migration pmd entry or a none pmd entry, which * should never happen, return SIGBUS */ if (unlikely(!pmd_present(*vmf->pmd))) ret = VM_FAULT_SIGBUS; else { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. */ if (unlikely(pte_none(*vmf->pte))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); else ret = do_shared_fault(vmf); /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (page_nid == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } return mpol_misplaced(page, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; pte_t pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. */ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } /* * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. */ pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); if (!page) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses * the case where a mapping is writable but the process never writes * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ if (!pte_write(pte)) flags |= TNF_NO_GROUP; /* * Flag if the page is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; } /* Migrate to the requested node */ migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; } else flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); if (vmf->vma->vm_ops->huge_fault) return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->huge_fault) return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow * concurrent faults). * * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge * pmd from under us anymore at this point because we hold the * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic * accesses. The code below just needs a consistent view * for the ifs and we later double check anyway with the * ptl lock held. So here a barrier will do. */ barrier(); if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } } if (!vmf->pte) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); else return do_fault(vmf); } if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (vmf->flags & FAULT_FLAG_WRITE) flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* * By the time we get here, we already hold the mm semaphore * * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), }; unsigned int dirty = flags & FAULT_FLAG_WRITE; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return VM_FAULT_OOM; vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pud_t orig_pud = *vmf.pud; barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { /* NUMA case for anonymous PUDs would go here */ if (dirty && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pmd_t orig_pmd = *vmf.pmd; barrier(); if (unlikely(is_swap_pmd(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); if (is_pmd_migration_entry(orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); if (dirty && !pmd_write(orig_pmd)) { ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } return handle_pte_fault(&vmf); } /* * By the time we get here, we already hold the mm semaphore * * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { vm_fault_t ret; __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) return VM_FAULT_SIGSEGV; /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. */ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. */ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ p4d_free(mm, new); else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); #else if (!pgd_present(*p4d)) { mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); pud_populate(mm, pud, new); } else /* Another has populated it */ pmd_free(mm, new); #else if (!pgd_present(*pud)) { mm_inc_nr_pmds(mm); pgd_populate(mm, pud, new); } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) goto out; pud = pud_offset(p4d, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) goto out; pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_huge(*pmd)) { if (!pmdpp) goto out; if (start && end) { *start = address & PMD_MASK; *end = *start + PMD_SIZE; mmu_notifier_invalidate_range_start(mm, *start, *end); } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); if (start && end) mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; if (start && end) { *start = address & PAGE_MASK; *end = *start + PAGE_SIZE; mmu_notifier_invalidate_range_start(mm, *start, *end); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; return 0; unlock: pte_unmap_unlock(ptep, *ptlp); if (start && end) mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } static inline int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, !(res = __follow_pte_pmd(mm, address, NULL, NULL, ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, !(res = __follow_pte_pmd(mm, address, start, end, ptepp, pmdpp, ptlp))); return res; } EXPORT_SYMBOL(follow_pte_pmd); /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping * @address: user virtual address * @pfn: location to store found PFN * * Only IO mappings and raw PFN mappings are allowed. * * Returns zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) { int ret = -EINVAL; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); pte_unmap_unlock(ptep, ptl); return 0; } EXPORT_SYMBOL(follow_pfn); #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys) { int ret = -EINVAL; pte_t *ptep, pte; spinlock_t *ptl; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; pte = *ptep; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; *prot = pgprot_val(pte_pgprot(pte)); *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; ret = 0; unlock: pte_unmap_unlock(ptep, ptl); out: return ret; } int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; int offset = addr & (PAGE_SIZE-1); if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); iounmap(maddr); return len; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. If non-NULL, use the * given task for page fault accounting. */ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (down_read_killable(&mm->mmap_sem)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, ret, offset; void *maddr; struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; #else /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, len, write); if (ret <= 0) break; bytes = ret; #endif } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } kunmap(page); put_page(page); } len -= bytes; buf += bytes; addr += bytes; } up_read(&mm->mmap_sem); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); } /* * Access another process' address space. * Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); /* * Print the name of a VMA. */ void print_vma_addr(char *prefix, unsigned long ip) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; /* * we might be running from an atomic context so we cannot sleep */ if (!down_read_trylock(&mm->mmap_sem)) return; vma = find_vma(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; char *buf = (char *)__get_free_page(GFP_NOWAIT); if (buf) { char *p; p = file_path(f, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; printk("%s%s[%lx+%lx]", prefix, kbasename(p), vma->vm_start, vma->vm_end - vma->vm_start); free_page((unsigned long)buf); } } up_read(&mm->mmap_sem); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) void __might_fault(const char *file, int line) { /* * Some code (nfs/sunrpc) uses socket ops on kernel memory while * holding the mmap_sem, this is safe because kernel memory doesn't * get paged out, therefore we'll never actually fault, and the * below annotations will generate false positives. */ if (uaccess_kernel()) return; if (pagefault_disabled()) return; __might_sleep(file, line, 0); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_sem); #endif } EXPORT_SYMBOL(__might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) /* * Process all subpages of the specified huge page with the specified * operation. The target subpage will be processed last to keep its * cache lines hot. */ static inline void process_huge_page( unsigned long addr_hint, unsigned int pages_per_huge_page, void (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l; unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; if (2 * n <= pages_per_huge_page) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { cond_resched(); process_subpage(addr + i * PAGE_SIZE, i, arg); } } else { /* If target subpage in second half of huge page */ base = pages_per_huge_page - 2 * (pages_per_huge_page - n); l = pages_per_huge_page - n; /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); process_subpage(addr + i * PAGE_SIZE, i, arg); } } /* * Process remaining subpages in left-right-left-right pattern * towards the target subpage */ for (i = 0; i < l; i++) { int left_idx = base + i; int right_idx = base + 2 * l - 1 - i; cond_resched(); process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); cond_resched(); process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); } } static void clear_gigantic_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; struct page *p = page; might_sleep(); for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } } static void clear_subpage(unsigned long addr, int idx, void *arg) { struct page *page = arg; clear_user_highpage(page + idx, addr); } void clear_huge_page(struct page *page, unsigned long addr_hint, unsigned int pages_per_huge_page) { unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); } static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { int i; struct page *dst_base = dst; struct page *src_base = src; for (i = 0; i < pages_per_huge_page; ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); i++; dst = mem_map_next(dst, dst_base, i); src = mem_map_next(src, src_base, i); } } struct copy_subpage_arg { struct page *dst; struct page *src; struct vm_area_struct *vma; }; static void copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, addr, copy_arg->vma); } void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int pages_per_huge_page) { unsigned long addr = addr_hint & ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { copy_user_gigantic_page(dst, src, addr, vma, pages_per_huge_page); return; } process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); } long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, unsigned int pages_per_huge_page, bool allow_pagefault) { void *src = (void *)usr_src; void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; struct page *subpage = dst_page; for (i = 0; i < pages_per_huge_page; i++, subpage = mem_map_next(subpage, dst_page, i)) { if (allow_pagefault) page_kaddr = kmap(subpage); else page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, (const void __user *)(src + i * PAGE_SIZE), PAGE_SIZE); if (allow_pagefault) kunmap(subpage); else kunmap_atomic(page_kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct page *page) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; page->ptl = ptl; return true; } void ptlock_free(struct page *page) { kmem_cache_free(page_ptl_cachep, page->ptl); } #endif |
73 73 56 56 56 27 17 264 263 142 42 264 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 | // SPDX-License-Identifier: GPL-2.0 /* * event tracer * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * - Added format output of fields of the trace point. * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. * */ #define pr_fmt(fmt) fmt #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> #include <trace/events/sched.h> #include <asm/setup.h> #include "trace_output.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; static inline int system_refcount(struct event_subsystem *system) { return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { return --system->ref_count; } /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ list_for_each_entry(file, &tr->events, list) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct list_head * trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; return event_call->class->get_fields(event_call); } static struct ftrace_event_field * __find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } return NULL; } struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; head = trace_get_fields(call); field = __find_event_field(head, name); if (field) return field; field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) return -ENOMEM; field->name = name; field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); else field->filter_type = filter_type; field->offset = offset; field->size = size; field->is_signed = is_signed; list_add(&field->link, head); return 0; } int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type); } EXPORT_SYMBOL_GPL(trace_define_field); #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ filter_type); \ if (ret) \ return ret; #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; static int trace_define_generic_fields(void) { int ret; __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); return ret; } static int trace_define_common_fields(void) { int ret; struct trace_entry ent; __common_field(unsigned short, type); __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); return ret; } static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kmem_cache_free(field_cachep, field); } } /* * run-time version of trace_event_get_offsets_<call>() that returns the last * accessible offset of trace fields excluding __dynamic_array bytes */ int trace_event_get_offsets(struct trace_event_call *call) { struct ftrace_event_field *tail; struct list_head *head; head = trace_get_fields(call); /* * head->next points to the last field with the largest offset, * since it was added last by trace_define_field() */ tail = list_first_entry(head, struct ftrace_event_field, link); return tail->offset + tail->size; } int trace_event_raw_init(struct trace_event_call *call) { int id; id = register_trace_event(&call->event); if (!id) return -ENODEV; return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_array_cpu *data; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); if (!pid_list) return false; data = this_cpu_ptr(tr->trace_buffer.data); return data->ignore_pid; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(trace_file)) return NULL; local_save_flags(fbuffer->flags); fbuffer->pc = preempt_count(); /* * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ if (IS_ENABLED(CONFIG_PREEMPT)) fbuffer->pc--; fbuffer->trace_file = trace_file; fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->flags, fbuffer->pc); if (!fbuffer->event) return NULL; fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); } void trace_event_enable_tgid_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } else { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } } while_for_each_event_file(); } static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; unsigned long file_flags = file->flags; int ret = 0; int disable; switch (enable) { case 0: /* * When soft_disable is set and enable is cleared, the sm_ref * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED * is set we do not want the event to be enabled before we * clear the bit. * * When soft_disable is not set but the SOFT_MODE flag is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do * nothing (but set SOFT_MODE). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { if (cmd) tracing_stop_cmdline_record(); if (tgid) tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; } set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } /* Enable or disable use of trace_buffered_event */ if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) trace_buffered_event_enable(); else trace_buffered_event_disable(); } return ret; } int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); } static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); } static void event_filter_pid_sched_process_fork(void *data, struct task_struct *self, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) { if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, prev) && trace_ignore_this_task(pid_list, next)); } static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = data; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ if (!this_cpu_read(tr->trace_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ if (this_cpu_read(tr->trace_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) { struct trace_pid_list *pid_list; struct trace_event_file *file; int cpu; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); if (!pid_list) return; unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } for_each_possible_cpu(cpu) per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false; rcu_assign_pointer(tr->filtered_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); trace_free_pid_list(pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr) { mutex_lock(&event_mutex); __ftrace_clear_event_pids(tr); mutex_unlock(&event_mutex); } static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; WARN_ON_ONCE(system_refcount(system) == 0); if (system_refcount_dec(system)) return; list_del(&system->list); if (filter) { kfree(filter->filter_string); kfree(filter); } kfree_const(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { WARN_ON_ONCE(system_refcount(system) == 0); system_refcount_inc(system); } static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } static void __put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) kfree(dir); } static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; if (!--dir->nr_events) { tracefs_remove_recursive(dir->entry); list_del(&dir->list); __put_system_dir(dir); } } static void remove_event_file_dir(struct trace_event_file *file) { struct dentry *dir = file->dir; struct dentry *child; if (dir) { spin_lock(&dir->d_lock); /* probably unneeded */ list_for_each_entry(child, &dir->d_subdirs, d_child) { if (d_really_is_positive(child)) /* probably unneeded */ d_inode(child)->i_private = NULL; } spin_unlock(&dir->d_lock); tracefs_remove_recursive(dir); } list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); kmem_cache_free(file_cachep, file); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { struct trace_event_file *file; struct trace_event_call *call; const char *name; int ret = -EINVAL; int eret = 0; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, name) != 0) continue; ret = ftrace_event_enable_disable(file, set); /* * Save the first error and return that. Some events * may still have been enabled, but let the user * know that something went wrong. */ if (ret && !eret) eret = ret; ret = eret; } return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { int ret; mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; } static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; if (!tr) return -ENOENT; /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. * :<event-name> is the same. * * <subsystem>:* means all events in that subsystem * <subsystem>: means the same. * * <name> (no ':') means all events in a subsystem with * the name <name> or any event that matches <name> */ match = strsep(&buf, ":"); if (buf) { sub = match; event = buf; match = NULL; if (!strlen(sub) || strcmp(sub, "*") == 0) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; } ret = __ftrace_set_clr_event(tr, match, sub, event, set); /* Put back the colon to allow this to be called again */ if (buf) *(buf - 1) = ':'; return ret; } /** * trace_set_clr_event - enable or disable an event * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @set: 1 to enable, 0 to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); if (!tr) return -ENODEV; return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; struct seq_file *m = file->private_data; struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) return 0; ret = tracing_update_buffers(); if (ret < 0) return ret; if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') set = 0; ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } ret = read; out_put: trace_parser_put(&parser); return ret; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) break; } return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { if (file->flags & EVENT_FILE_FL_ENABLED) return file; } return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = s_next(m, file, &l); if (!file) break; } return file; } static int t_show(struct seq_file *m, void *v) { struct trace_event_file *file = v; struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", trace_event_name(call)); return 0; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static void * p_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); return trace_pid_next(pid_list, v, pos); } static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; /* * Grab the mutex, to keep calls to p_next() having the same * tr->filtered_pids as p_start() has. * If we just passed the tr->filtered_pids around, then RCU would * have been enough, but doing that makes things more complex. */ mutex_lock(&event_mutex); rcu_read_lock_sched(); pid_list = rcu_dereference_sched(tr->filtered_pids); if (!pid_list) return NULL; return trace_pid_start(pid_list, pos); } static void p_stop(struct seq_file *m, void *p) __releases(RCU) { rcu_read_unlock_sched(); mutex_unlock(&event_mutex); } static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; mutex_lock(&event_mutex); file = event_file_data(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); if (flags & EVENT_FILE_FL_SOFT_DISABLED || flags & EVENT_FILE_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; ret = tracing_update_buffers(); if (ret < 0) return ret; switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_data(filp); if (likely(file)) ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; default: return -EINVAL; } *ppos += cnt; return ret ? ret : cnt; } static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) continue; /* * We need to find out if all the events are set * or if all events or cleared, or if we have * a mixture. */ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. */ if (set == 3) break; } mutex_unlock(&event_mutex); buf[0] = set_to_char[set]; buf[1] = '\n'; ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); return ret; } static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; ret = tracing_update_buffers(); if (ret < 0) return ret; if (val != 0 && val != 1) return -EINVAL; /* * Opening of "enable" adds a ref count to system, * so the name is safe to use. */ if (system) name = system->name; ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; ret = cnt; out: *ppos += cnt; return ret; } enum { FORMAT_HEADER = 1, FORMAT_FIELD_SEPERATOR = 2, FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_call *call = event_file_data(m->private); struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: node = common_head; break; case FORMAT_FIELD_SEPERATOR: node = head; break; case FORMAT_PRINTFMT: /* all done */ return NULL; } node = node->prev; if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; else if (node == head) return (void *)FORMAT_PRINTFMT; else return node; } static int f_show(struct seq_file *m, void *v) { struct trace_event_call *call = event_file_data(m->private); struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: seq_putc(m, '\n'); return 0; case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: * field:TYPE VAR * If TYPE := TYPE[LEN], it is shown: * field:TYPE VAR[LEN] */ array_descriptor = strchr(field->type, '['); if (!strncmp(field->type, "__data_loc", 10)) array_descriptor = NULL; if (!array_descriptor) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); else seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, array_descriptor, field->offset, field->size, !!field->is_signed); return 0; } static void *f_start(struct seq_file *m, loff_t *pos) { void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); if (!event_file_data(m->private)) return ERR_PTR(-ENODEV); while (l < *pos && p) p = f_next(m, p, &l); return p; } static void f_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { .start = f_start, .next = f_next, .stop = f_stop, .show = f_show, }; static int trace_format_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; m = file->private_data; m->private = file; return 0; } static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { int id = (long)event_file_data(filp); char buf[32]; int len; if (unlikely(!id)) return -ENODEV; len = sprintf(buf, "%d\n", id); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); mutex_lock(&event_mutex); file = event_file_data(filp); if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; char *buf; int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); if (file) err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */ struct trace_array *tr; int ret; if (tracing_is_disabled()) return -ENODEV; /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { if (dir == inode->i_private) { /* Don't open systems with no events */ if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; } goto exit_loop; } } } exit_loop: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (!system) return -ENODEV; /* Some versions of gcc think dir can be uninitialized here */ WARN_ON(!dir); /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); return -ENODEV; } ret = tracing_open_generic(inode, filp); if (ret < 0) { trace_array_put(tr); put_system(dir); } return ret; } static int system_tr_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; if (tracing_is_disabled()) return -ENODEV; if (trace_array_get(tr) < 0) return -ENODEV; /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); if (!dir) { trace_array_put(tr); return -ENOMEM; } dir->tr = tr; ret = tracing_open_generic(inode, filp); if (ret < 0) { trace_array_put(tr); kfree(dir); return ret; } filp->private_data = dir; return 0; } static int subsystem_release(struct inode *inode, struct file *file) { struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable * all subsystems. */ if (dir->subsystem) put_system(dir); else kfree(dir); return 0; } static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static ssize_t show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { int (*func)(struct trace_seq *s) = filp->private_data; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); func(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; /* * This function is called by on_each_cpu() while the * event_mutex is held. */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, current)); } static ssize_t ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; if (!cnt) return 0; ret = tracing_update_buffers(); if (ret < 0) return ret; mutex_lock(&event_mutex); filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } if (filtered_pids) { tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. * Register a probe this is called after all other probes * to only keep ignore_pid set if next pid matches. */ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, tr, INT_MAX); register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, tr, 0); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr, 0); } /* * Ignoring of pids is done at task switch. But we have to * check for those tasks that are currently running. * Always do this in case a pid was appended or removed. */ on_each_cpu(ignore_task_cpu, tr, 1); out: mutex_unlock(&event_mutex); if (ret > 0) *ppos += ret; return ret; } static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, .show = trace_pid_show, .stop = p_stop, }; static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, .write = ftrace_event_pid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { .open = tracing_open_generic, .read = event_enable_read, .write = event_enable_write, .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { .open = trace_format_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_generic, .read = event_filter_read, .write = event_filter_write, .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_tr_enable_fops = { .open = system_tr_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; static int ftrace_event_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct seq_file *m; int ret; ret = seq_open(file, seq_ops); if (ret < 0) return ret; m = file->private_data; /* copy tr over to seq ops */ m->private = inode->i_private; return ret; } static int ftrace_event_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static int ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; int ret; if (trace_array_get(tr) < 0) return -ENODEV; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; if (trace_array_get(tr) < 0) return -ENODEV; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static struct event_subsystem * create_new_subsystem(const char *name) { struct event_subsystem *system; /* need to create new entry */ system = kmalloc(sizeof(*system), GFP_KERNEL); if (!system) return NULL; system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ system->name = kstrdup_const(name, GFP_KERNEL); if (!system->name) goto out_free; system->filter = NULL; system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; list_add(&system->list, &event_subsystems); return system; out_free: kfree_const(system->name); kfree(system); return NULL; } static struct dentry * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { struct trace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; return dir->entry; } } /* Now see if the system itself exists. */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) break; } /* Reset system variable when not found */ if (&system->list == &event_subsystems) system = NULL; dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) goto out_fail; if (!system) { system = create_new_subsystem(name); if (!system) goto out_free; } else __get_system(system); dir->entry = tracefs_create_dir(name, parent); if (!dir->entry) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; entry = tracefs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; pr_warn("Could not create tracefs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); list_add(&dir->list, &tr->systems); return dir->entry; out_free: kfree(dir); out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) pr_warn("No memory to create event subsystem %s\n", name); return NULL; } static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct list_head *head; struct dentry *d_events; const char *name; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { d_events = event_subsystem_dir(tr, call->class->system, file, parent); if (!d_events) return -ENOMEM; } else d_events = parent; name = trace_event_name(call); file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) trace_create_file("enable", 0644, file->dir, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) trace_create_file("id", 0444, file->dir, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif /* * Other events may have the same class. Only update * the fields if they are not already defined. */ head = trace_get_fields(call); if (list_empty(head)) { ret = call->class->define_fields(call); if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); return -1; } } /* * Only event directories that can be enabled should have * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, &event_hist_fops); #endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); return 0; } static void remove_event_from_tracers(struct trace_event_call *call) { struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { if (file->event_call != call) continue; remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); } static void event_remove(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) continue; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); if (call->event.funcs) __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", name); } return ret; } static int __register_event(struct trace_event_call *call, struct module *mod) { int ret; ret = event_init(call); if (ret < 0) return ret; list_add(&call->list, &ftrace_events); call->mod = mod; return 0; } static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); memmove(ptr + elen, ptr + len, rlen); /* Make sure we end the new string */ ptr[elen + rlen] = 0; return ptr + elen; } static void update_event_printk(struct trace_event_call *call, struct trace_eval_map *map) { char *ptr; int quote = 0; int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { ptr++; /* paranoid */ if (!*ptr) break; continue; } if (*ptr == '"') { quote ^= 1; continue; } if (quote) continue; if (isdigit(*ptr)) { /* skip numbers */ do { ptr++; /* Check for alpha chars like ULL */ } while (isalnum(*ptr)); if (!*ptr) break; /* * A number must have some kind of delimiter after * it, and we can ignore that too. */ continue; } if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* * No need to decrement here, as eval_replace() * returns the pointer to the character passed * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ continue; } skip_more: do { ptr++; } while (isalnum(*ptr) || *ptr == '_'); if (!*ptr) break; /* * If what comes after this variable is a '.' or * '->' then we can continue to ignore that string. */ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { ptr += *ptr == '.' ? 1 : 2; if (!*ptr) break; goto skip_more; } /* * Once again, we can skip the delimiter that came * after the string. */ continue; } } } void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; int last_i; int i; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { first = true; last_i = 0; last_system = call->class->system; } /* * Since calls are grouped by systems, the likelyhood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip seaching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i * will be the index of the first map[] that has a matching * system. */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ if (first) { last_i = i; first = false; } update_event_printk(call, map[i]); } } } up_write(&trace_event_sem); } static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_pid_list *pid_list; struct trace_event_file *file; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); if (pid_list) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); return file; } /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; return event_create_dir(tr->event_dir, file); } /* * Just create a decriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ static __init int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; file = trace_create_new_event(call, tr); if (!file) return -ENOMEM; return 0; } struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); int trace_add_event_call_nolock(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); if (ret >= 0) __add_event_to_tracers(call); mutex_unlock(&trace_types_lock); return ret; } /* Add an additional event_call dynamically */ int trace_add_event_call(struct trace_event_call *call) { int ret; mutex_lock(&event_mutex); ret = trace_add_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; } /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); free_event_filter(call->filter); call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) return -EBUSY; #endif do_for_each_event_file(tr, file) { if (file->event_call != call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) return -EBUSY; /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); __trace_remove_event_call(call); return 0; } /* no event_mutex version */ int trace_remove_event_call_nolock(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); return ret; } /* Remove an event_call */ int trace_remove_event_call(struct trace_event_call *call) { int ret; mutex_lock(&event_mutex); ret = trace_remove_event_call_nolock(call); mutex_unlock(&event_mutex); return ret; } #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ event++) #ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; /* Don't add infrastructure for mods without tracepoints */ if (trace_module_has_bad_taint(mod)) { pr_err("%s: module has bad taint, not creating trace events\n", mod->name); return; } start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; for_each_event(call, start, end) { __register_event(*call, mod); __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) __trace_remove_event_call(call); } up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded * registered any events that were used. The only worry is if * a new module gets loaded, and takes on the same id as the events * of this module. When printing out the buffer, traced events left * over from this module may be passed to the new module events and * unexpected results may occur. */ tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); break; case MODULE_STATE_GOING: trace_module_remove_events(mod); break; } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return 0; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(call)); } } /* Returns any file that matches the system and event */ struct trace_event_file * __find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class) continue; if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } return NULL; } /* Returns valid trace event files that match system and event */ struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; file = __find_event_file(tr, system, event); if (!file || !file->event_call->class->reg || file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) return NULL; return file; } #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" struct event_probe_data { struct trace_event_file *file; unsigned long count; int ref; bool enable; }; static void update_event_probe(struct event_probe_data *data) { if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void event_enable_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; update_event_probe(edata); } static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (edata->count != -1) (edata->count)--; update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (WARN_ON_ONCE(!pdata || !*pdata)) return 0; edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, edata->file->event_call->class->system, trace_event_name(edata->file->event_call)); if (edata->count == -1) seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; struct event_probe_data *edata = init_data; int ret; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENODEV; *data = mapper; } ret = ftrace_func_mapper_add_ip(mapper, ip, edata); if (ret < 0) return ret; edata->ref++; return 0; } static int free_probe_data(void *data) { struct event_probe_data *edata = data; edata->ref--; if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); module_put(edata->file->event_call->mod); kfree(edata); } return 0; } static void event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, free_probe_data); return; } edata = ftrace_func_mapper_remove_ip(mapper, ip); if (WARN_ON_ONCE(!edata)) return; if (WARN_ON_ONCE(edata->ref <= 0)) return; free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_enable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static int event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; const char *system; const char *event; char *number; bool enable; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; system = strsep(¶m, ":"); if (!param) return -EINVAL; event = strsep(¶m, ":"); mutex_lock(&event_mutex); ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) goto out; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; if (enable) ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; data->enable = enable; data->count = -1; data->file = file; if (!param) goto out_reg; number = strsep(¶m, ":"); ret = -EINVAL; if (!strlen(number)) goto out_free; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, &data->count); if (ret) goto out_free; out_reg: /* Don't let event modules unload while probe registered */ ret = try_module_get(file->event_call->mod); if (!ret) { ret = -EBUSY; goto out_free; } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { ret = -ENOENT; goto out_disable; } else if (ret < 0) goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; out: mutex_unlock(&event_mutex); return ret; out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: module_put(file->event_call->mod); out_free: kfree(data); goto out; } static struct ftrace_func_command event_enable_cmd = { .name = ENABLE_EVENT_STR, .func = event_enable_func, }; static struct ftrace_func_command event_disable_cmd = { .name = DISABLE_EVENT_STR, .func = event_enable_func, }; static __init int register_event_cmds(void) { int ret; ret = register_ftrace_command(&event_enable_cmd); if (WARN_ON(ret < 0)) return ret; ret = register_ftrace_command(&event_disable_cmd); if (WARN_ON(ret < 0)) unregister_ftrace_command(&event_enable_cmd); return ret; } #else static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* * The top level array has already had its trace_event_file * descriptors created in order to allow for early events to * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated * to the events. */ static __init void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(file->event_call)); } } /* * For early boot up, the top trace array requires to have * a list of events that can be enabled. This must be done before * the filesystem is set up in order to allow events to be traced * early. */ static __init void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ if (WARN_ON_ONCE(call->mod)) continue; ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", trace_event_name(call)); } } /* Remove the event directory structure for a trace directory. */ static void __trace_remove_event_dirs(struct trace_array *tr) { struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) __trace_add_new_event(call, tr); } extern struct trace_event_call *__start_ftrace_events[]; extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; tracing_selftest_disabled = true; return 1; } __setup("trace_event=", setup_trace_event); /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct dentry *d_events; struct dentry *entry; entry = tracefs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warn("Could not create tracefs 'set_event' entry\n"); return -ENOMEM; } d_events = tracefs_create_dir("events", parent); if (!d_events) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } entry = trace_create_file("enable", 0644, d_events, tr, &ftrace_tr_enable_fops); if (!entry) { pr_warn("Could not create tracefs 'enable' entry\n"); return -ENOMEM; } /* There are not as crucial, just warn if they are not created */ entry = tracefs_create_file("set_event_pid", 0644, parent, tr, &ftrace_set_event_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); /* ring buffer internal formats */ entry = trace_create_file("header_page", 0444, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_page' entry\n"); entry = trace_create_file("header_event", 0444, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; return 0; } /** * event_trace_add_tracer - add a instance of a trace_array to events * @parent: The parent dentry to place the files/directories for events in * @tr: The trace array associated with these events * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also * creates the event hierachry in the @parent/events directory. * * Returns 0 on success. * * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out; down_write(&trace_event_sem); __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: return ret; } /* * The top trace array already had its file descriptors created. * Now the files themselves need to be created. */ static __init int early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; mutex_lock(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out_unlock; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); out_unlock: mutex_unlock(&event_mutex); return ret; } /* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); /* Clear the pid list */ __ftrace_clear_event_pids(tr); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); tracefs_remove_recursive(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; return 0; } static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } static __init void early_enable_events(struct trace_array *tr, bool disable_first) { char *buf = bootup_event_buf; char *token; int ret; while (true) { token = strsep(&buf, ","); if (!token) break; if (*token) { /* Restarting syscalls requires that we stop them first */ if (disable_first) ftrace_set_clr_event(tr, token, 0); ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct trace_event_call **iter, *call; int ret; if (!tr) return -ENODEV; for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; ret = event_init(call); if (!ret) list_add(&call->list, &ftrace_events); } /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories * are created. Create the file entries now, and attach them * to the actual file dentries later. */ __trace_early_add_events(tr); early_enable_events(tr, false); trace_printk_start_comm(); register_event_cmds(); register_trigger_cmds(); return 0; } /* * event_trace_enable() is called from trace_event_init() first to * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() * is called before pid 1 starts, and this flag is never set, making * the syscall tracepoint never get reached, but the event is enabled * regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { struct trace_array *tr; tr = top_trace_array(); if (!tr) return -ENODEV; early_enable_events(tr, true); return 0; } early_initcall(event_trace_enable_again); __init int event_trace_init(void) { struct trace_array *tr; struct dentry *d_tracer; struct dentry *entry; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; d_tracer = tracing_init_dentry(); if (IS_ERR(d_tracer)) return 0; entry = tracefs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); if (trace_define_generic_fields()) pr_warn("tracing: Failed to allocated generic fields"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); ret = early_event_add_tracer(d_tracer, tr); if (ret) return ret; #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif return 0; } void __init trace_event_init(void) { event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); } #ifdef CONFIG_FTRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); static DEFINE_MUTEX(test_mutex); static __init void test_work(struct work_struct *dummy) { spin_lock(&test_spinlock); spin_lock_irq(&test_spinlock_irq); udelay(1); spin_unlock_irq(&test_spinlock_irq); spin_unlock(&test_spinlock); mutex_lock(&test_mutex); msleep(1); mutex_unlock(&test_mutex); } static __init int event_test_thread(void *unused) { void *test_malloc; test_malloc = kmalloc(1234, GFP_KERNEL); if (!test_malloc) pr_info("failed to kmalloc\n"); schedule_on_each_cpu(test_work); kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } /* * Do various things that may trigger events. */ static __init void event_test_stuff(void) { struct task_struct *test_thread; test_thread = kthread_run(event_test_thread, NULL, "test-events"); msleep(1); kthread_stop(test_thread); } /* * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ static __init void event_trace_self_tests(void) { struct trace_subsystem_dir *dir; struct trace_event_file *file; struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return; pr_info("Running tests on trace events:\n"); list_for_each_entry(file, &tr->events, list) { call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) continue; /* * Testing syscall events here is pretty useless, but * we still do it if configured. But this is time consuming. * What we really need is a user thread to perform the * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS if (call->class->system && strcmp(call->class->system, "syscalls") == 0) continue; #endif pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } ftrace_event_enable_disable(file, 1); event_test_stuff(); ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } /* Now test at the sub system level */ pr_info("Running tests on trace event systems:\n"); list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) continue; pr_info("Testing event system %s: ", system->name); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); continue; } event_test_stuff(); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); continue; } pr_cont("OK\n"); } /* Test with all events enabled */ pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; } event_test_stuff(); /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; } pr_cont("OK\n"); } #ifdef CONFIG_FUNCTION_TRACER static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; struct ftrace_entry *entry; unsigned long flags; long disabled; int cpu; int pc; pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; local_save_flags(flags); event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; event_trigger_unlock_commit(&event_trace_file, buffer, event, entry, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) { int ret; event_trace_file.tr = top_trace_array(); if (WARN_ON(!event_trace_file.tr)) return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); return; } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } #else static __init void event_trace_self_test_with_function(void) { } #endif static __init int event_trace_self_tests_init(void) { if (!tracing_selftest_disabled) { event_trace_self_tests(); event_trace_self_test_with_function(); } return 0; } late_initcall(event_trace_self_tests_init); #endif |
22 22 22 3 1 21 40 40 36 33 1 14 22 15 21 13 14 14 22 21 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | /* * Copyright (C) International Business Machines Corp., 2000-2002 * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/posix_acl.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" #include "jfs_txnmgr.h" #include "jfs_xattr.h" #include "jfs_acl.h" #include "jfs_debug.h" int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int rc = 0; rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); inode_unlock(inode); return rc ? -EIO : 0; } static int jfs_open(struct inode *inode, struct file *file) { int rc; if ((rc = dquot_file_open(inode, file))) return rc; /* * We attempt to allow only one "active" file open per aggregate * group. Otherwise, appending to files in parallel can cause * fragmentation within the files. * * If the file is empty, it was probably just created and going * to be written to. If it has a size, we'll hold off until the * file is actually grown. */ if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && (inode->i_size == 0)) { struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } return 0; } static int jfs_release(struct inode *inode, struct file *file) { struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag != -1) { struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; atomic_dec(&bmap->db_active[ji->active_ag]); ji->active_ag = -1; } spin_unlock_irq(&ji->ag_lock); return 0; } int jfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; rc = setattr_prepare(dentry, iattr); if (rc) return rc; if (is_quota_modification(inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { rc = dquot_transfer(inode, iattr); if (rc) return rc; } if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); rc = inode_newsize_ok(inode, iattr->ia_size); if (rc) return rc; truncate_setsize(inode, iattr->ia_size); jfs_truncate(inode); } setattr_copy(inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) rc = posix_acl_chmod(inode, inode->i_mode); return rc; } const struct inode_operations jfs_file_inode_operations = { .listxattr = jfs_listxattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; const struct file_operations jfs_file_operations = { .open = jfs_open, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = jfs_compat_ioctl, #endif }; |
22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) Qu Wenruo 2017. All rights reserved. */ /* * The module is used to catch unexpected/corrupted tree block data. * Such behavior can be caused either by a fuzzed image or bugs. * * The objective is to do leaf/node validation checks when tree block is read * from disk, and check *every* possible member, so other code won't * need to checking them again. * * Due to the potential and unwanted damage, every checker needs to be * carefully reviewed otherwise so it does not prevent mount of valid images. */ #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" #include "compression.h" #include "volumes.h" /* * Error message should follow the following format: * corrupt <type>: <identifier>, <reason>[, <bad_value>] * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. * It's recommened to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error * @bad_value: optional, it's recommened to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed * inside each component. */ /* * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. * Allows callers to customize the output. */ __printf(4, 5) __cold static void generic_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf); va_end(args); } /* * Customized reporter for extent data item, since its key objectid and * offset has its own meaning. */ __printf(4, 5) __cold static void file_extent_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); va_end(args); } /* * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ #define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \ ({ \ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ file_extent_err((fs_info), (leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) static int check_extent_data_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(key->offset, sectorsize)) { file_extent_err(fs_info, leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { file_extent_err(fs_info, leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), BTRFS_FILE_EXTENT_TYPES); return -EUCLEAN; } /* * Support for new compression/encrption must introduce incompat flag, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { file_extent_err(fs_info, leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_COMPRESS_TYPES); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { file_extent_err(fs_info, leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; } if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { file_extent_err(fs_info, leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; } /* Compressed inline extent has no on-disk size, skip it */ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) return 0; /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { file_extent_err(fs_info, leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); return -EUCLEAN; } return 0; } /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { file_extent_err(fs_info, leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) || CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) || CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) || CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) || CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; return 0; } static int check_csum_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { u32 sectorsize = fs_info->sectorsize; u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { generic_err(fs_info, leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { generic_err(fs_info, leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { generic_err(fs_info, leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } return 0; } /* * Customized reported for dir_item, only important new info is key->objectid, * which represents inode number */ __printf(4, 5) __cold static void dir_item_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, &vaf); va_end(args); } static int check_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dir_item *di; u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { u32 name_len; u32 data_len; u32 max_name_len; u32 total_size; u32 name_hash; u8 dir_type; /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { dir_item_err(fs_info, leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; } /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { dir_item_err(fs_info, leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } if (key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR) { dir_item_err(fs_info, leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY) { dir_item_err(fs_info, leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR) max_name_len = XATTR_NAME_MAX; else max_name_len = BTRFS_NAME_LEN; /* Name/data length check */ name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (name_len > max_name_len) { dir_item_err(fs_info, leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { dir_item_err(fs_info, leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, BTRFS_MAX_XATTR_SIZE(fs_info)); return -EUCLEAN; } if (data_len && dir_type != BTRFS_FT_XATTR) { dir_item_err(fs_info, leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; } total_size = sizeof(*di) + name_len + data_len; /* header and name/data should not cross item boundary */ if (cur + total_size > item_size) { dir_item_err(fs_info, leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; } /* * Special check for XATTR/DIR_ITEM, as key->offset is name * hash, should match its name */ if (key->type == BTRFS_DIR_ITEM_KEY || key->type == BTRFS_XATTR_ITEM_KEY) { char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (key->offset != name_hash) { dir_item_err(fs_info, leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; } } cur += total_size; di = (struct btrfs_dir_item *)((void *)di + total_size); } return 0; } __printf(4, 5) __cold static void block_group_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); va_end(args); } static int check_block_group_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_block_group_item bgi; u32 item_size = btrfs_item_size_nr(leaf, slot); u64 flags; u64 type; /* * Here we don't really care about alignment since extent allocator can * handle it. We care more about the size. */ if (key->offset == 0) { block_group_err(fs_info, leaf, slot, "invalid block group size 0"); return -EUCLEAN; } if (item_size != sizeof(bgi)) { block_group_err(fs_info, leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); return -EUCLEAN; } read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); if (btrfs_block_group_chunk_objectid(&bgi) != BTRFS_FIRST_CHUNK_TREE_OBJECTID) { block_group_err(fs_info, leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_block_group_chunk_objectid(&bgi), BTRFS_FIRST_CHUNK_TREE_OBJECTID); return -EUCLEAN; } if (btrfs_block_group_used(&bgi) > key->offset) { block_group_err(fs_info, leaf, slot, "invalid block group used, have %llu expect [0, %llu)", btrfs_block_group_used(&bgi), key->offset); return -EUCLEAN; } flags = btrfs_block_group_flags(&bgi); if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { block_group_err(fs_info, leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)); return -EUCLEAN; } type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; if (type != BTRFS_BLOCK_GROUP_DATA && type != BTRFS_BLOCK_GROUP_METADATA && type != BTRFS_BLOCK_GROUP_SYSTEM && type != (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA)) { block_group_err(fs_info, leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } return 0; } __printf(5, 6) __cold static void chunk_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *leaf, const struct btrfs_chunk *chunk, u64 logical, const char *fmt, ...) { bool is_sb; struct va_format vaf; va_list args; int i; int slot = -1; /* Only superblock eb is able to have such small offset */ is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); if (!is_sb) { /* * Get the slot number by iterating through all slots, this * would provide better readability. */ for (i = 0; i < btrfs_header_nritems(leaf); i++) { if (btrfs_item_ptr_offset(leaf, i) == (unsigned long)chunk) { slot = i; break; } } } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (is_sb) btrfs_crit(fs_info, "corrupt superblock syschunk array: chunk_start=%llu, %pV", logical, &vaf); else btrfs_crit(fs_info, "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, logical, &vaf); va_end(args); } /* * The common chunk check which could also work on super block sys chunk array. * * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical) { u64 length; u64 stripe_len; u16 num_stripes; u16 sub_stripes; u64 type; u64 features; bool mixed = false; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); if (!num_stripes) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } if (!IS_ALIGNED(logical, fs_info->sectorsize)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", logical, fs_info->sectorsize); return -EUCLEAN; } if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", btrfs_chunk_sector_size(leaf, chunk), fs_info->sectorsize); return -EUCLEAN; } if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & type) { chunk_err(fs_info, leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EUCLEAN; } if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; } if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); return -EUCLEAN; } features = btrfs_super_incompat_flags(fs_info->super_copy); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) mixed = true; if (!mixed) { if ((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA)) { chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } } if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } return 0; } __printf(4, 5) __cold static void dev_item_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { struct btrfs_key key; struct va_format vaf; va_list args; btrfs_item_key_to_cpu(eb, &key, slot); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, key.objectid, &vaf); va_end(args); } static int check_dev_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { dev_item_err(fs_info, leaf, slot, "invalid objectid: has=%llu expect=%llu", key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (btrfs_device_id(leaf, ditem) != key->offset) { dev_item_err(fs_info, leaf, slot, "devid mismatch: key has=%llu item has=%llu", key->offset, btrfs_device_id(leaf, ditem)); return -EUCLEAN; } /* * For device total_bytes, we don't have reliable way to check it, as * it can be 0 for device removal. Device size check can only be done * by dev extents check. */ if (btrfs_device_bytes_used(leaf, ditem) > btrfs_device_total_bytes(leaf, ditem)) { dev_item_err(fs_info, leaf, slot, "invalid bytes used: have %llu expect [0, %llu]", btrfs_device_bytes_used(leaf, ditem), btrfs_device_total_bytes(leaf, ditem)); return -EUCLEAN; } /* * Remaining members like io_align/type/gen/dev_group aren't really * utilized. Skip them to make later usage of them easier. */ return 0; } /* Inode item error output has the same format as dir_item_err() */ #define inode_item_err(fs_info, eb, slot, fmt, ...) \ dir_item_err(fs_info, eb, slot, fmt, __VA_ARGS__) static int check_inode_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_inode_item *iitem; u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || key->objectid > BTRFS_LAST_FREE_OBJECTID) && key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && key->objectid != BTRFS_FREE_INO_OBJECTID) { generic_err(fs_info, leaf, slot, "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID, BTRFS_FREE_INO_OBJECTID); return -EUCLEAN; } if (key->offset != 0) { inode_item_err(fs_info, leaf, slot, "invalid key offset: has %llu expect 0", key->offset); return -EUCLEAN; } iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { inode_item_err(fs_info, leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), super_gen + 1); return -EUCLEAN; } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { inode_item_err(fs_info, leaf, slot, "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; } /* * For size and nbytes it's better not to be too strict, as for dir * item its size/nbytes can easily get wrong, but doesn't affect * anything in the fs. So here we skip the check. */ mode = btrfs_inode_mode(leaf, iitem); if (mode & ~valid_mask) { inode_item_err(fs_info, leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); return -EUCLEAN; } /* * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2, * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG. * Only needs to check BLK, LNK and SOCKS */ if (!is_power_of_2(mode & S_IFMT)) { if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { inode_item_err(fs_info, leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { inode_item_err(fs_info, leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { inode_item_err(fs_info, leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK); return -EUCLEAN; } return 0; } /* * Common point to switch the item-specific validation. */ static int check_leaf_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { int ret = 0; struct btrfs_chunk *chunk; switch (key->type) { case BTRFS_EXTENT_DATA_KEY: ret = check_extent_data_item(fs_info, leaf, key, slot); break; case BTRFS_EXTENT_CSUM_KEY: ret = check_csum_item(fs_info, leaf, key, slot); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: ret = check_dir_item(fs_info, leaf, key, slot); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(fs_info, leaf, key, slot); break; case BTRFS_CHUNK_ITEM_KEY: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset); break; case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(fs_info, leaf, key, slot); break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(fs_info, leaf, key, slot); break; } return ret; } static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, bool check_item_data) { /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; u32 nritems = btrfs_header_nritems(leaf); int slot; if (btrfs_header_level(leaf) != 0) { generic_err(fs_info, leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); return -EUCLEAN; } /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an * extent buffer alone we can not find out what is the id of the * corresponding subvolume tree, so we can not figure out if the extent * buffer corresponds to the root of the relocation tree or not. So * skip this check for relocation trees. */ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { u64 owner = btrfs_header_owner(leaf); struct btrfs_root *check_root; /* These trees must never be empty */ if (owner == BTRFS_ROOT_TREE_OBJECTID || owner == BTRFS_CHUNK_TREE_OBJECTID || owner == BTRFS_EXTENT_TREE_OBJECTID || owner == BTRFS_DEV_TREE_OBJECTID || owner == BTRFS_FS_TREE_OBJECTID || owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { generic_err(fs_info, leaf, 0, "invalid root, root %llu must never be empty", owner); return -EUCLEAN; } /* Unknown tree */ if (owner == 0) { generic_err(fs_info, leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; } key.objectid = owner; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; check_root = btrfs_get_fs_root(fs_info, &key, false); /* * The only reason we also check NULL here is that during * open_ctree() some roots has not yet been set up. */ if (!IS_ERR_OR_NULL(check_root)) { struct extent_buffer *eb; eb = btrfs_root_node(check_root); /* if leaf is the root, then it's fine */ if (leaf != eb) { generic_err(fs_info, leaf, 0, "invalid nritems, have %u should not be 0 for non-root leaf", nritems); free_extent_buffer(eb); return -EUCLEAN; } free_extent_buffer(eb); } return 0; } if (nritems == 0) return 0; /* * Check the following things to make sure this is a good leaf, and * leaf users won't need to bother with similar sanity checks: * * 1) key ordering * 2) item offset and size * No overlap, no hole, all inside the leaf. * 3) item content * If possible, do comprehensive sanity check. * NOTE: All checks must only rely on the item data itself. */ for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; int ret; btrfs_item_key_to_cpu(leaf, &key, slot); /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { generic_err(fs_info, leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, key.offset); return -EUCLEAN; } /* * Make sure the offset and ends are right, remember that the * item data starts at the end of the leaf and grows towards the * front. */ if (slot == 0) item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); else item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { generic_err(fs_info, leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), item_end_expected); return -EUCLEAN; } /* * Check to make sure that we don't point outside of the leaf, * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { generic_err(fs_info, leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); return -EUCLEAN; } /* Also check if the item pointer overlaps with btrfs item. */ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { generic_err(fs_info, leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; } if (check_item_data) { /* * Check if the item size and content meet other * criteria */ ret = check_leaf_item(fs_info, leaf, &key, slot); if (ret < 0) return ret; } prev_key.objectid = key.objectid; prev_key.type = key.type; prev_key.offset = key.offset; } return 0; } int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { return check_leaf(fs_info, leaf, true); } int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { return check_leaf(fs_info, leaf, false); } int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; int slot; int level = btrfs_header_level(node); u64 bytenr; int ret = 0; if (level <= 0 || level >= BTRFS_MAX_LEVEL) { generic_err(fs_info, node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return -EUCLEAN; } for (slot = 0; slot < nr - 1; slot++) { bytenr = btrfs_node_blockptr(node, slot); btrfs_node_key_to_cpu(node, &key, slot); btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { generic_err(fs_info, node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { generic_err(fs_info, node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); ret = -EUCLEAN; goto out; } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { generic_err(fs_info, node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, next_key.offset); ret = -EUCLEAN; goto out; } } out: return ret; } |
240 133 208 209 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | /* * V9FS definitions. * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #ifndef FS_9P_V9FS_H #define FS_9P_V9FS_H #include <linux/backing-dev.h> /** * enum p9_session_flags - option flags for each 9P session * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. * @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ #define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, V9FS_ACCESS_CLIENT = 0x10, V9FS_POSIX_ACL = 0x20 }; /* possible values of ->cache */ /** * enum p9_cache_modes - user specified cache preferences * @CACHE_NONE: do not cache data, dentries, or directory contents (default) * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency * * eventually support loose, tight, time, session, default always none */ enum p9_cache_modes { CACHE_NONE, CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, nr__p9_cache_modes }; /** * struct v9fs_session_info - per-instance session information * @flags: session options of type &p9_session_flags * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle * @cache: cache mode of type &p9_cache_modes * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message * @dfltuid: default numeric userid to mount hierarchy as * @dfltgid: default numeric groupid to mount hierarchy as * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy * @clnt: reference to 9P network client instantiated for this session * @slist: reference to list of registered 9p sessions * * This structure holds state for each session instance established during * a sys_mount() . * * Bugs: there seems to be a lot of state which could be condensed and/or * removed. */ struct v9fs_session_info { /* options */ unsigned char flags; unsigned char nodev; unsigned short debug; unsigned int afid; unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; struct fscache_cookie *fscache; #endif char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ kuid_t dfltuid; /* default uid/muid for legacy support */ kgid_t dfltgid; /* default gid for legacy support */ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { return container_of(inode, struct v9fs_inode, vfs_inode); } extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" #define V9FS_DEFUID KUIDT_INIT(-2) #define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { return (inode->i_sb->s_fs_info); } static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) { return dentry->d_sb->s_fs_info; } static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000U; } static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } /** * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else return v9fs_inode_from_fid(v9ses, fid, sb, 0); } /** * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); else return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif |
148 149 149 3 3 149 149 149 149 60 23 23 23 22 60 40 60 4 4 60 59 59 59 38 59 21 21 59 26 26 26 26 7 22 22 22 24 6 4 4 1 1 3 3 3 3 4 26 26 23 23 1 22 19 3 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/brec.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle individual btree records */ #include "hfsplus_fs.h" #include "hfsplus_raw.h" static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd); static int hfs_brec_update_parent(struct hfs_find_data *fd); static int hfs_btree_inc_height(struct hfs_btree *); /* Get the length and offset of the given record in the given node */ u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off) { __be16 retval[2]; u16 dataoff; dataoff = node->tree->node_size - (rec + 2) * 2; hfs_bnode_read(node, retval, dataoff, 4); *off = be16_to_cpu(retval[1]); return be16_to_cpu(retval[0]) - *off; } /* Get the length of the key from a keyed record */ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec) { u16 retval, recoff; if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) return 0; if ((node->type == HFS_NODE_INDEX) && !(node->tree->attributes & HFS_TREE_VARIDXKEYS) && (node->tree->cnid != HFSPLUS_ATTR_CNID)) { retval = node->tree->max_key_len + 2; } else { recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); if (!recoff) return 0; if (recoff > node->tree->node_size - 2) { pr_err("recoff %d too large\n", recoff); return 0; } retval = hfs_bnode_read_u16(node, recoff) + 2; if (retval > node->tree->max_key_len + 2) { pr_err("keylen %d too large\n", retval); retval = 0; } } return retval; } int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node; int size, key_len, rec; int data_off, end_off; int idx_rec_off, data_rec_off, end_rec_off; __be32 cnid; tree = fd->tree; if (!fd->bnode) { if (!tree->root) hfs_btree_inc_height(tree); node = hfs_bnode_find(tree, tree->leaf_head); if (IS_ERR(node)) return PTR_ERR(node); fd->bnode = node; fd->record = -1; } new_node = NULL; key_len = be16_to_cpu(fd->search_key->key_len) + 2; again: /* new record idx and complete record size */ rec = fd->record + 1; size = key_len + entry_len; node = fd->bnode; hfs_bnode_dump(node); /* get last offset */ end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) panic("not enough room!\n"); new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) return PTR_ERR(new_node); goto again; } if (node->type == HFS_NODE_LEAF) { tree->leaf_count++; mark_inode_dirty(tree->inode); } node->num_recs++; /* write new last offset */ hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); hfs_bnode_write_u16(node, end_rec_off, end_off + size); data_off = end_off; data_rec_off = end_rec_off + 2; idx_rec_off = tree->node_size - (rec + 1) * 2; if (idx_rec_off == data_rec_off) goto skip; /* move all following entries */ do { data_off = hfs_bnode_read_u16(node, data_rec_off + 2); hfs_bnode_write_u16(node, data_rec_off, data_off + size); data_rec_off += 2; } while (data_rec_off < idx_rec_off); /* move data away */ hfs_bnode_move(node, data_off + size, data_off, end_off - data_off); skip: hfs_bnode_write(node, fd->search_key, data_off, key_len); hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); /* * update parent key if we inserted a key * at the start of the node and it is not the new node */ if (!rec && new_node != node) { hfs_bnode_read_key(node, fd->search_key, data_off + size); hfs_brec_update_parent(fd); } if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); new_node->parent = tree->root; } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index data entry */ cnid = cpu_to_be32(new_node->this); entry = &cnid; entry_len = sizeof(cnid); /* get index key */ hfs_bnode_read_key(new_node, fd->search_key, 14); __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_bnode_put(new_node); new_node = NULL; if ((tree->attributes & HFS_TREE_VARIDXKEYS) || (tree->cnid == HFSPLUS_ATTR_CNID)) key_len = be16_to_cpu(fd->search_key->key_len) + 2; else { fd->search_key->key_len = cpu_to_be16(tree->max_key_len); key_len = tree->max_key_len + 2; } goto again; } return 0; } int hfs_brec_remove(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *parent; int end_off, rec_off, data_off, size; tree = fd->tree; node = fd->bnode; again: rec_off = tree->node_size - (fd->record + 2) * 2; end_off = tree->node_size - (node->num_recs + 1) * 2; if (node->type == HFS_NODE_LEAF) { tree->leaf_count--; mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); if (!node->parent) return 0; parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); hfs_bnode_put(node); node = fd->bnode = parent; __hfs_brec_find(node, fd, hfs_find_rec_by_key); goto again; } hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs); if (rec_off == end_off) goto skip; size = fd->keylength + fd->entrylength; do { data_off = hfs_bnode_read_u16(node, rec_off); hfs_bnode_write_u16(node, rec_off + 2, data_off - size); rec_off -= 2; } while (rec_off >= end_off); /* fill hole */ hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size, data_off - fd->keyoffset - size); skip: hfs_bnode_dump(node); if (!fd->record) hfs_brec_update_parent(fd); return 0; } static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node, *next_node; struct hfs_bnode_desc node_desc; int num_recs, new_rec_off, new_off, old_rec_off; int data_start, data_end, size; tree = fd->tree; node = fd->bnode; new_node = hfs_bmap_alloc(tree); if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; new_node->parent = node->parent; new_node->type = node->type; new_node->height = node->height; if (node->next) next_node = hfs_bnode_find(tree, node->next); else next_node = NULL; if (IS_ERR(next_node)) { hfs_bnode_put(node); hfs_bnode_put(new_node); return next_node; } size = tree->node_size / 2 - node->num_recs * 2 - 14; old_rec_off = tree->node_size - 4; num_recs = 1; for (;;) { data_start = hfs_bnode_read_u16(node, old_rec_off); if (data_start > size) break; old_rec_off -= 2; if (++num_recs < node->num_recs) continue; /* panic? */ hfs_bnode_put(node); hfs_bnode_put(new_node); if (next_node) hfs_bnode_put(next_node); return ERR_PTR(-ENOSPC); } if (fd->record + 1 < num_recs) { /* new record is in the lower half, * so leave some more space there */ old_rec_off += 2; num_recs--; data_start = hfs_bnode_read_u16(node, old_rec_off); } else { hfs_bnode_put(node); hfs_bnode_get(new_node); fd->bnode = new_node; fd->record -= num_recs; fd->keyoffset -= data_start - 14; fd->entryoffset -= data_start - 14; } new_node->num_recs = node->num_recs - num_recs; node->num_recs = num_recs; new_rec_off = tree->node_size - 2; new_off = 14; size = data_start - new_off; num_recs = new_node->num_recs; data_end = data_start; while (num_recs) { hfs_bnode_write_u16(new_node, new_rec_off, new_off); old_rec_off -= 2; new_rec_off -= 2; data_end = hfs_bnode_read_u16(node, old_rec_off); new_off = data_end - size; num_recs--; } hfs_bnode_write_u16(new_node, new_rec_off, new_off); hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start); /* update new bnode header */ node_desc.next = cpu_to_be32(new_node->next); node_desc.prev = cpu_to_be32(new_node->prev); node_desc.type = new_node->type; node_desc.height = new_node->height; node_desc.num_recs = cpu_to_be16(new_node->num_recs); node_desc.reserved = 0; hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); /* update previous bnode header */ node->next = new_node->this; hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc)); node_desc.next = cpu_to_be32(node->next); node_desc.num_recs = cpu_to_be16(node->num_recs); hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); /* update next bnode header */ if (next_node) { next_node->prev = new_node->this; hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); node_desc.prev = cpu_to_be32(next_node->prev); hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc)); hfs_bnode_put(next_node); } else if (node->this == tree->leaf_tail) { /* if there is no next node, this might be the new tail */ tree->leaf_tail = new_node->this; mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); hfs_bnode_dump(new_node); hfs_bnode_put(node); return new_node; } static int hfs_brec_update_parent(struct hfs_find_data *fd) { struct hfs_btree *tree; struct hfs_bnode *node, *new_node, *parent; int newkeylen, diff; int rec, rec_off, end_rec_off; int start_off, end_off; tree = fd->tree; node = fd->bnode; new_node = NULL; if (!node->parent) return 0; again: parent = hfs_bnode_find(tree, node->parent); if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); if (fd->record < 0) return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; /* size difference between old and new key */ if ((tree->attributes & HFS_TREE_VARIDXKEYS) || (tree->cnid == HFSPLUS_ATTR_CNID)) newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; diff = newkeylen - fd->keylength; if (!diff) goto skip; if (diff > 0) { end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { hfs_dbg(BNODE_MOD, "splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) return PTR_ERR(new_node); parent = fd->bnode; rec = fd->record; rec_off = tree->node_size - (rec + 2) * 2; end_rec_off = tree->node_size - (parent->num_recs + 1) * 2; } } end_off = start_off = hfs_bnode_read_u16(parent, rec_off); hfs_bnode_write_u16(parent, rec_off, start_off + diff); start_off -= 4; /* move previous cnid too */ while (rec_off > end_rec_off) { rec_off -= 2; end_off = hfs_bnode_read_u16(parent, rec_off); hfs_bnode_write_u16(parent, rec_off, end_off + diff); } hfs_bnode_move(parent, start_off + diff, start_off, end_off - start_off); skip: hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen); hfs_bnode_dump(parent); hfs_bnode_put(node); node = parent; if (new_node) { __be32 cnid; if (!new_node->parent) { hfs_btree_inc_height(tree); new_node->parent = tree->root; } fd->bnode = hfs_bnode_find(tree, new_node->parent); /* create index key and entry */ hfs_bnode_read_key(new_node, fd->search_key, 14); cnid = cpu_to_be32(new_node->this); __hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key); hfs_brec_insert(fd, &cnid, sizeof(cnid)); hfs_bnode_put(fd->bnode); hfs_bnode_put(new_node); if (!rec) { if (new_node == node) goto out; /* restore search_key */ hfs_bnode_read_key(node, fd->search_key, 14); } new_node = NULL; } if (!rec && node->parent) goto again; out: fd->bnode = node; return 0; } static int hfs_btree_inc_height(struct hfs_btree *tree) { struct hfs_bnode *node, *new_node; struct hfs_bnode_desc node_desc; int key_size, rec; __be32 cnid; node = NULL; if (tree->root) { node = hfs_bnode_find(tree, tree->root); if (IS_ERR(node)) return PTR_ERR(node); } new_node = hfs_bmap_alloc(tree); if (IS_ERR(new_node)) { hfs_bnode_put(node); return PTR_ERR(new_node); } tree->root = new_node->this; if (!tree->depth) { tree->leaf_head = tree->leaf_tail = new_node->this; new_node->type = HFS_NODE_LEAF; new_node->num_recs = 0; } else { new_node->type = HFS_NODE_INDEX; new_node->num_recs = 1; } new_node->parent = 0; new_node->next = 0; new_node->prev = 0; new_node->height = ++tree->depth; node_desc.next = cpu_to_be32(new_node->next); node_desc.prev = cpu_to_be32(new_node->prev); node_desc.type = new_node->type; node_desc.height = new_node->height; node_desc.num_recs = cpu_to_be16(new_node->num_recs); node_desc.reserved = 0; hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc)); rec = tree->node_size - 2; hfs_bnode_write_u16(new_node, rec, 14); if (node) { /* insert old root idx into new root */ node->parent = tree->root; if (node->type == HFS_NODE_LEAF || tree->attributes & HFS_TREE_VARIDXKEYS || tree->cnid == HFSPLUS_ATTR_CNID) key_size = hfs_bnode_read_u16(node, 14) + 2; else key_size = tree->max_key_len + 2; hfs_bnode_copy(new_node, 14, node, 14, key_size); if (!(tree->attributes & HFS_TREE_VARIDXKEYS) && (tree->cnid != HFSPLUS_ATTR_CNID)) { key_size = tree->max_key_len + 2; hfs_bnode_write_u16(new_node, 14, tree->max_key_len); } cnid = cpu_to_be32(node->this); hfs_bnode_write(new_node, &cnid, 14 + key_size, 4); rec -= 2; hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4); hfs_bnode_put(node); } hfs_bnode_put(new_node); mark_inode_dirty(tree->inode); return 0; } |
1 1 1 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* * linux/fs/nls/nls_cp1255.c * * Charset cp1255 translation tables. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x20ac, 0x0000, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, 0x02c6, 0x2030, 0x0000, 0x2039, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x90*/ 0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x02dc, 0x2122, 0x0000, 0x203a, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xa0*/ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20aa, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x203e, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xc0*/ 0x05b0, 0x05b1, 0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x0000, 0x05bb, 0x05bc, 0x05bd, 0x05be, 0x05bf, /* 0xd0*/ 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f0, 0x05f1, 0x05f2, 0x05f3, 0x05f4, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2017, /* 0xe0*/ 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, /* 0xf0*/ 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, 0x05e8, 0x05e9, 0x05ea, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0xa2, 0xa3, 0x00, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xba, /* 0xf0-0xf7 */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char page05[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xb0-0xb7 */ 0xc8, 0xc9, 0x00, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xb8-0xbf */ 0xd0, 0xd1, 0xd2, 0xd3, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xd0-0xd7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xd8-0xdf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xe0-0xe7 */ 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xfe, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0xa4, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb9, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, page05, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp1255", .alias = "iso8859-8", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp1255(void) { return register_nls(&table); } static void __exit exit_nls_cp1255(void) { unregister_nls(&table); } module_init(init_nls_cp1255) module_exit(exit_nls_cp1255) MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(iso8859-8); |
29 29 29 29 164 165 73 73 29 29 60 60 21 21 29 23 29 2 142 14 139 19 15 14 15 11 15 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 86 86 38 31 38 16 38 32 3 3 29 38 27 3 3 24 38 3 38 38 38 38 1 38 6 5 5 5 5 4 3 5 5 2 5 5 5 5 11 5 11 5 5 5 79 31 31 31 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_quota.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire * system, _not_ one per file system. XQM keeps track of the overall * quota functionality, including maintaining the freelist and hash * tables of dquots. */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows * fuzzy lookups instead of exact matches. Holding the lock over multiple * operations is fine as all callers are used either during mount/umount * or quotaoff. */ #define XFS_DQ_LOOKUP_BATCH 32 STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, int type, int (*execute)(struct xfs_dquot *dqp, void *data), void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); uint32_t next_index; int last_error = 0; int skipped; int nr_found; restart: skipped = 0; next_index = 0; nr_found = 0; while (1) { struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; int error = 0; int i; mutex_lock(&qi->qi_tree_lock); nr_found = radix_tree_gang_lookup(tree, (void **)batch, next_index, XFS_DQ_LOOKUP_BATCH); if (!nr_found) { mutex_unlock(&qi->qi_tree_lock); break; } for (i = 0; i < nr_found; i++) { struct xfs_dquot *dqp = batch[i]; next_index = be32_to_cpu(dqp->q_core.d_id) + 1; error = execute(batch[i], data); if (error == -EAGAIN) { skipped++; continue; } if (error && last_error != -EFSCORRUPTED) last_error = error; } mutex_unlock(&qi->qi_tree_lock); /* bail out if the filesystem is corrupted. */ if (last_error == -EFSCORRUPTED) { skipped = 0; break; } /* we're done if id overflows back to zero */ if (!next_index) break; } if (skipped) { delay(1); goto restart; } return last_error; } /* * Purge a dquot from all tracking data structures and free it. */ STATIC int xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { xfs_dqunlock(dqp); return -EAGAIN; } dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } xfs_dqflock(dqp); } ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; /* * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(mp, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; } /* * Purge the dquot cache. */ void xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { if (flags & XFS_QMOPT_UQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* * Just destroy the quotainfo structure. */ void xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } /* * Called from the vfsops layer. */ void xfs_qm_unmount_quotas( xfs_mount_t *mp) { /* * Release the dquots that root inode, et al might be holding, * before we flush quotas and blow away the quotainfo structure. */ ASSERT(mp->m_rootip); xfs_qm_dqdetach(mp->m_rootip); if (mp->m_rbmip) xfs_qm_dqdetach(mp->m_rbmip); if (mp->m_rsumip) xfs_qm_dqdetach(mp->m_rsumip); /* * Release the quota inodes. */ if (mp->m_quotainfo) { if (mp->m_quotainfo->qi_uquotaip) { xfs_irele(mp->m_quotainfo->qi_uquotaip); mp->m_quotainfo->qi_uquotaip = NULL; } if (mp->m_quotainfo->qi_gquotaip) { xfs_irele(mp->m_quotainfo->qi_gquotaip); mp->m_quotainfo->qi_gquotaip = NULL; } if (mp->m_quotainfo->qi_pquotaip) { xfs_irele(mp->m_quotainfo->qi_pquotaip); mp->m_quotainfo->qi_pquotaip = NULL; } } } STATIC int xfs_qm_dqattach_one( xfs_inode_t *ip, xfs_dqid_t id, uint type, bool doalloc, xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; /* * See if we already have it in the inode itself. IO_idqpp is &i_udquot * or &i_gdquot. This made the code look weird, but made the logic a lot * simpler. */ dqp = *IO_idqpp; if (dqp) { trace_xfs_dqattach_found(dqp); return 0; } /* * Find the dquot from somewhere. This bumps the reference count of * dquot and returns it locked. This can return ENOENT if dquot didn't * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); if (error) return error; trace_xfs_dqattach_get(dqp); /* * dqget may have dropped and re-acquired the ilock, but it guarantees * that the dquot returned is the one that should go in the inode. */ *IO_idqpp = dqp; xfs_dqunlock(dqp); return 0; } static bool xfs_qm_need_dqattach( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; if (!XFS_IS_QUOTA_RUNNING(mp)) return false; if (!XFS_IS_QUOTA_ON(mp)) return false; if (!XFS_NOT_DQATTACHED(mp, ip)) return false; if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; return true; } /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. * If @doalloc is true, the dquot(s) will be allocated if needed. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. */ int xfs_qm_dqattach_locked( xfs_inode_t *ip, bool doalloc) { xfs_mount_t *mp = ip->i_mount; int error = 0; if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; ASSERT(ip->i_pdquot); } done: /* * Don't worry about the dquots that we may have attached before any * error - they'll get detached later if it has not already been done. */ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); return error; } int xfs_qm_dqattach( struct xfs_inode *ip) { int error; if (!xfs_qm_need_dqattach(ip)) return 0; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, false); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Release dquots (and their references) if any. * The inode should be locked EXCL except when this's called by * xfs_ireclaim. */ void xfs_qm_dqdetach( xfs_inode_t *ip) { if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) return; trace_xfs_dquot_dqdetach(ip); ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); if (ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } if (ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } if (ip->i_pdquot) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } } struct xfs_qm_isolate { struct list_head buffers; struct list_head dispose; }; static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); struct xfs_qm_isolate *isol = arg; if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ if (dqp->q_nrefs) { xfs_dqunlock(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); return LRU_REMOVED; } /* * If the dquot is dirty, flush it. If it's already being flushed, just * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); goto out_miss_busy; } if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; int error; trace_xfs_dqreclaim_dirty(dqp); /* we have to drop the LRU lock to flush the dquot */ spin_unlock(lru_lock); error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock_dirty; xfs_buf_delwri_queue(bp, &isol->buffers); xfs_buf_relse(bp); goto out_unlock_dirty; } xfs_dqfunlock(dqp); /* * Prevent lookups now that we are past the point of no return. */ dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); return LRU_SKIP; out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); spin_lock(lru_lock); return LRU_RETRY; } static unsigned long xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); struct xfs_qm_isolate isol; unsigned long freed; int error; if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); freed = list_lru_shrink_walk(&qi->qi_lru, sc, xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) xfs_warn(NULL, "%s: dquot reclaim failed", __func__); while (!list_empty(&isol.dispose)) { struct xfs_dquot *dqp; dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } return freed; } static unsigned long xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); return list_lru_shrink_count(&qi->qi_lru, sc); } STATIC void xfs_qm_set_defquota( xfs_mount_t *mp, uint type, xfs_quotainfo_t *qinf) { xfs_dquot_t *dqp; struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; int error; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; defq = xfs_get_defquota(dqp, qinf); /* * Timers and warnings have been already set, let's just set the * default limits for this quota type */ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); xfs_qm_dqdestroy(dqp); } /* Initialize quota time limits from the root dquot. */ static void xfs_qm_init_timelimits( struct xfs_mount *mp, struct xfs_quotainfo *qinf) { struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; uint type; int error; qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. * * Timers and warnings are globally set by the first timer found in * user/group/proj quota types, otherwise a default value is used. * This should be split into different fields per quota type. */ if (XFS_IS_UQUOTA_RUNNING(mp)) type = XFS_DQ_USER; else if (XFS_IS_GQUOTA_RUNNING(mp)) type = XFS_DQ_GROUP; else type = XFS_DQ_PROJ; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. */ if (ddqp->d_btimer) qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); if (ddqp->d_itimer) qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); if (ddqp->d_rtbtimer) qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); if (ddqp->d_bwarns) qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); if (ddqp->d_iwarns) qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); if (ddqp->d_rtbwarns) qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); xfs_qm_dqdestroy(dqp); } /* * This initializes all the quota information that's kept in the * mount structure */ STATIC int xfs_qm_init_quotainfo( struct xfs_mount *mp) { struct xfs_quotainfo *qinf; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); error = list_lru_init(&qinf->qi_lru); if (error) goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ error = xfs_qm_init_quotainos(mp); if (error) goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); xfs_qm_init_timelimits(mp, qinf); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); if (XFS_IS_GQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); if (XFS_IS_PQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; error = register_shrinker(&qinf->qi_shrinker); if (error) goto out_free_inos; return 0; out_free_inos: mutex_destroy(&qinf->qi_quotaofflock); mutex_destroy(&qinf->qi_tree_lock); xfs_qm_destroy_quotainos(qinf); out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: kmem_free(qinf); mp->m_quotainfo = NULL; return error; } /* * Gets called when unmounting a filesystem or when all quotas get * turned off. * This purges the quota inodes, destroys locks and frees itself. */ void xfs_qm_destroy_quotainfo( xfs_mount_t *mp) { xfs_quotainfo_t *qi; qi = mp->m_quotainfo; ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; } /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes */ STATIC int xfs_qm_qino_alloc( xfs_mount_t *mp, xfs_inode_t **ip, uint flags) { xfs_trans_t *tp; int error; bool need_alloc = true; *ip = NULL; /* * With superblock that doesn't have separate pquotino, we * share an inode between gquota and pquota. If the on-disk * superblock has GQUOTA and the filesystem is now mounted * with PQUOTA, just use sb_gquotino for sb_pquotino and * vice-versa. */ if (!xfs_sb_version_has_pquotino(&mp->m_sb) && (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { xfs_ino_t ino = NULLFSINO; if ((flags & XFS_QMOPT_PQUOTA) && (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ip); if (error) return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; need_alloc = false; } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); if (error) return error; if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip); if (error) { xfs_trans_cancel(tp); return error; } } /* * Make the changes in the superblock, and log those too. * sbfields arg may contain fields other than *QUOTINO; * VERSIONNUM for example. */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; /* qflags will get updated fully _after_ quotacheck */ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; else if (flags & XFS_QMOPT_GQUOTA) mp->m_sb.sb_gquotino = (*ip)->i_ino; else mp->m_sb.sb_pquotino = (*ip)->i_ino; spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } if (need_alloc) xfs_finish_inode_setup(*ip); return error; } STATIC void xfs_qm_reset_dqcounts( xfs_mount_t *mp, xfs_buf_t *bp, xfs_dqid_t id, uint type) { struct xfs_dqblk *dqb; int j; xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); /* * Reset all counters and timers. They'll be * started afresh by xfs_qm_quotacheck. */ #ifdef DEBUG j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / sizeof(xfs_dqblk_t); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { struct xfs_disk_dquot *ddq; ddq = (struct xfs_disk_dquot *)&dqb[j]; /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); if (fa) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for * project quotas or vice versa */ ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; ddq->d_btimer = 0; ddq->d_itimer = 0; ddq->d_rtbtimer = 0; ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } } } STATIC int xfs_qm_reset_dqcounts_all( struct xfs_mount *mp, xfs_dqid_t firstid, xfs_fsblock_t bno, xfs_filblks_t blkcnt, uint flags, struct list_head *buffer_list) { struct xfs_buf *bp; int error; int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); error = 0; /* * Blkcnt arg can be a very big number, and might even be * larger than the log itself. So, we have to break it up into * manageable-sized transactions. * Note that we don't start a permanent transaction here; we might * not be able to get a log reservation for the whole thing up front, * and we don't really care to either, because we just discard * everything if we were to crash in the middle of this loop. */ while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); /* * CRC and validation errors will return a EFSCORRUPTED here. If * this occurs, re-read without CRC validation so that we can * repair the damage via xfs_qm_reset_dqcounts(). This process * will leave a trace in the log indicating corruption has * been detected. */ if (error == -EFSCORRUPTED) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); } if (error) break; /* * A corrupt buffer might not have a verifier attached, so * make sure we have the correct one attached before writeback * occurs. */ bp->b_ops = &xfs_dquot_buf_ops; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } /* * Iterate over all allocated dquot blocks in this quota inode, zeroing all * counters for every chunk of dquots that we find. */ STATIC int xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, uint flags, struct list_head *buffer_list) { struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; xfs_filblks_t maxlblkcnt; xfs_dqid_t firstid; xfs_fsblock_t rablkno; xfs_filblks_t rablkcnt; error = 0; /* * This looks racy, but we can't keep an inode lock across a * trans_reserve. But, this gets called during quotacheck, and that * happens only at mount time which is single threaded. */ if (qip->i_d.di_nblocks == 0) return 0; map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { uint lock_mode; nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); xfs_iunlock(qip, lock_mode); if (error) break; ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); for (i = 0; i < nmaps; i++) { ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); ASSERT(map[i].br_blockcount); lblkno += map[i].br_blockcount; if (map[i].br_startblock == HOLESTARTBLOCK) continue; firstid = (xfs_dqid_t) map[i].br_startoff * mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ if ((i+1 < nmaps) && (map[i+1].br_startblock != HOLESTARTBLOCK)) { rablkcnt = map[i+1].br_blockcount; rablkno = map[i+1].br_startblock; while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), mp->m_quotainfo->qi_dqchunklen, &xfs_dquot_buf_ops); rablkno++; } } /* * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, flags, buffer_list); if (error) goto out; } } while (nmaps > 0); out: kmem_free(map); return error; } /* * Called by dqusage_adjust in doing a quotacheck. * * Given the inode, and a dquot id this updates both the incore dqout as well * as the buffer copy. This is so that once the quotacheck is done, we can * just log all the buffers, as opposed to logging numerous updates to * individual dquots. */ STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; xfs_dqid_t id; int error; id = xfs_qm_id_for_quotatype(ip, type); error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. */ ASSERT(error != -ESRCH); ASSERT(error != -ENOENT); return error; } trace_xfs_dqadjust(dqp); /* * Adjust the inode count and the block count to reflect this inode's * resource usage. */ be64_add_cpu(&dqp->q_core.d_icount, 1); dqp->q_res_icount++; if (nblks) { be64_add_cpu(&dqp->q_core.d_bcount, nblks); dqp->q_res_bcount += nblks; } if (rtblks) { be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); dqp->q_res_rtbcount += rtblks; } /* * Set default limits, adjust timers (since we changed usages) * * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } dqp->dq_flags |= XFS_DQ_DIRTY; xfs_qm_dqput(dqp); return 0; } /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. */ /* ARGSUSED */ STATIC int xfs_qm_dqusage_adjust( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ int *ubused, /* not used */ int *res) /* result code value */ { xfs_inode_t *ip; xfs_qcnt_t nblks; xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * rootino must have its resources accounted for, not so with the quota * inodes. */ if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; return -EINVAL; } /* * We don't _need_ to take the ilock EXCL here because quotacheck runs * at mount time and therefore nobody will be racing chown/chproj. */ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; } ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto error0; } xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; /* * Add the (disk blocks and inode) resources occupied by this * inode to its dquots. We do this adjustment in the incore dquot, * and also copy the changes to its buffer. * We don't care about putting these changes in a transaction * envelope because if we crash in the middle of a 'quotacheck' * we have to start from the beginning anyway. * Once we're done, we'll log all the dquot bufs. * * The *QUOTA_ON checks below may look pretty racy, but quotachecks * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, rtblks); if (error) goto error0; } xfs_irele(ip); *res = BULKSTAT_RV_DIDONE; return 0; error0: xfs_irele(ip); *res = BULKSTAT_RV_GIVEUP; return error; } STATIC int xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; xfs_dqlock(dqp); if (dqp->dq_flags & XFS_DQ_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; /* * The only way the dquot is already flush locked by the time quotacheck * gets here is if reclaim flushed it before the dqadjust walk dirtied * it for the final time. Quotacheck collects all dquot bufs in the * local delwri queue before dquots are dirtied, so reclaim can't have * possibly queued it for I/O. The only way out is to push the buffer to * cycle the flush lock. */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -EINVAL; goto out_unlock; } xfs_buf_unlock(bp); xfs_buf_delwri_pushbuf(bp, buffer_list); xfs_buf_rele(bp); error = -EAGAIN; goto out_unlock; } error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); return error; } /* * Walk thru all the filesystem inodes and construct a consistent view * of the disk quota world. If the quotacheck fails, disable quotas. */ STATIC int xfs_qm_quotacheck( xfs_mount_t *mp) { int done, count, error, error2; xfs_ino_t lastino; size_t structsz; uint flags; LIST_HEAD (buffer_list); struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; count = INT_MAX; structsz = 1; lastino = 0; flags = 0; ASSERT(uip || gip || pip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); /* * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset * their counters to zero. We need a clean slate. * We don't log our changes till later. */ if (uip) { error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } if (gip) { error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_GQUOTA_CHKD; } if (pip) { error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_PQUOTA_CHKD; } do { /* * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters in core. */ error = xfs_bulkstat(mp, &lastino, &count, xfs_qm_dqusage_adjust, structsz, NULL, &done); if (error) break; } while (!done); /* * We've made all the changes that we need to make incore. Flush them * down to disk buffers if everything was updated successfully. */ if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, &buffer_list); } if (XFS_IS_GQUOTA_ON(mp)) { error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } error2 = xfs_buf_delwri_submit(&buffer_list); if (!error) error = error2; /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the * dirty dquots that might be cached, we just want to get rid of them * and turn quotaoff. The dquots won't be attached to any of the inodes * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. */ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; mp->m_qflags |= flags; error_return: xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", error); /* * We must turn off quotas. */ ASSERT(mp->m_quotainfo != NULL); xfs_qm_destroy_quotainfo(mp); if (xfs_mount_reset_sbqflags(mp)) { xfs_warn(mp, "Quotacheck: Failed to reset quota flags."); } } else xfs_notice(mp, "Quotacheck: Done."); return error; } /* * This is called from xfs_mountfs to start quotas and initialize all * necessary data structures like quotainfo. This is also responsible for * running a quotacheck as necessary. We are guaranteed that the superblock * is consistently read in at this point. * * If we fail here, the mount will continue with quota turned off. We don't * need to inidicate success or failure at all. */ void xfs_qm_mount_quotas( struct xfs_mount *mp) { int error = 0; uint sbf; /* * If quotas on realtime volumes is not supported, we disable * quotas immediately. */ if (mp->m_sb.sb_rextents) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; } ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * Allocate the quotainfo structure inside the mount struct, and * create quotainode(s), and change/rev superblock if necessary. */ error = xfs_qm_init_quotainfo(mp); if (error) { /* * We must turn off quotas. */ ASSERT(mp->m_quotainfo == NULL); mp->m_qflags = 0; goto write_changes; } /* * If any of the quotas are not consistent, do a quotacheck. */ if (XFS_QM_NEED_QUOTACHECK(mp)) { error = xfs_qm_quotacheck(mp); if (error) { /* Quotacheck failed and disabled quotas. */ return; } } /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. */ if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; if (!XFS_IS_GQUOTA_ON(mp)) mp->m_qflags &= ~XFS_GQUOTA_CHKD; if (!XFS_IS_PQUOTA_ON(mp)) mp->m_qflags &= ~XFS_PQUOTA_CHKD; write_changes: /* * We actually don't have to acquire the m_sb_lock at all. * This can only be called from mount, and that's single threaded. XXX */ spin_lock(&mp->m_sb_lock); sbf = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; spin_unlock(&mp->m_sb_lock); if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { if (xfs_sync_sb(mp, false)) { /* * We could only have been turning quotas off. * We aren't in very good shape actually because * the incore structures are convinced that quotas are * off, but the on disk superblock doesn't know that ! */ ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); xfs_alert(mp, "%s: Superblock update failed!", __func__); } } if (error) { xfs_warn(mp, "Failed to initialize disk quotas."); return; } } /* * This is called after the superblock has been read in and we're ready to * iget the quota inodes. */ STATIC int xfs_qm_init_quotainos( xfs_mount_t *mp) { struct xfs_inode *uip = NULL; struct xfs_inode *gip = NULL; struct xfs_inode *pip = NULL; int error; uint flags = 0; ASSERT(mp->m_quotainfo); /* * Get the uquota and gquota inodes */ if (xfs_sb_version_hasquota(&mp->m_sb)) { if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip); if (error) goto error_rele; } if (XFS_IS_PQUOTA_ON(mp) && mp->m_sb.sb_pquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_pquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, 0, 0, &pip); if (error) goto error_rele; } } else { flags |= XFS_QMOPT_SBVERSION; } /* * Create the three inodes, if they don't exist already. The changes * made above will get added to a transaction and logged in one of * the qino_alloc calls below. If the device is readonly, * temporarily switch to read-write to do this. */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { error = xfs_qm_qino_alloc(mp, &uip, flags | XFS_QMOPT_UQUOTA); if (error) goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { error = xfs_qm_qino_alloc(mp, &gip, flags | XFS_QMOPT_GQUOTA); if (error) goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { error = xfs_qm_qino_alloc(mp, &pip, flags | XFS_QMOPT_PQUOTA); if (error) goto error_rele; } mp->m_quotainfo->qi_uquotaip = uip; mp->m_quotainfo->qi_gquotaip = gip; mp->m_quotainfo->qi_pquotaip = pip; return 0; error_rele: if (uip) xfs_irele(uip); if (gip) xfs_irele(gip); if (pip) xfs_irele(pip); return error; } STATIC void xfs_qm_destroy_quotainos( xfs_quotainfo_t *qi) { if (qi->qi_uquotaip) { xfs_irele(qi->qi_uquotaip); qi->qi_uquotaip = NULL; /* paranoia */ } if (qi->qi_gquotaip) { xfs_irele(qi->qi_gquotaip); qi->qi_gquotaip = NULL; } if (qi->qi_pquotaip) { xfs_irele(qi->qi_pquotaip); qi->qi_pquotaip = NULL; } } STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); xfs_qm_dqdestroy(dqp); } /* --------------- utility functions for vnodeops ---------------- */ /* * Given an inode, a uid, gid and prid make sure that we have * allocated relevant dquot(s) on disk, and that we won't exceed inode * quotas by creating this file. * This also attaches dquot(s) to the given inode after locking it, * and returns the dquots corresponding to the uid and/or gid. * * in : inode (unlocked) * out : udquot, gdquot with references taken and unlocked */ int xfs_qm_vop_dqalloc( struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, struct xfs_dquot **O_gdqpp, struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; int error; uint lockflags; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; lockflags = XFS_ILOCK_EXCL; xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = ip->i_d.di_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { error = xfs_qm_dqattach_locked(ip, true); if (error) { xfs_iunlock(ip, lockflags); return error; } } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode * takes priority over what's sent in the uid argument. * We must unlock inode here before calling dqget if * we're not sending the inode, because otherwise * we'll deadlock by doing trans_reserve while * holding ilock. */ xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; } /* * Get the ilock in the right order. */ xfs_dqunlock(uq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { /* * Take an extra reference, because we'll return * this to caller */ ASSERT(ip->i_udquot); uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(pq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_pdquot); pq = xfs_qm_dqhold(ip->i_pdquot); } } if (uq) trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; else xfs_qm_dqrele(uq); if (O_gdqpp) *O_gdqpp = gq; else xfs_qm_dqrele(gq); if (O_pdqpp) *O_pdqpp = pq; else xfs_qm_dqrele(pq); return 0; error_rele: xfs_qm_dqrele(gq); xfs_qm_dqrele(uq); return error; } /* * Actually transfer ownership, and do dquot modifications. * These were already reserved. */ xfs_dquot_t * xfs_qm_vop_chown( xfs_trans_t *tp, xfs_inode_t *ip, xfs_dquot_t **IO_olddq, xfs_dquot_t *newdq) { xfs_dquot_t *prevdq; uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); /* old dquot */ prevdq = *IO_olddq; ASSERT(prevdq); ASSERT(prevdq != newdq); xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* * Take an extra reference, because the inode is going to keep * this dquot pointer even after the trans_commit. */ *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } /* * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). */ int xfs_qm_vop_chown_reserve( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, uint flags) { struct xfs_mount *mp = ip->i_mount; uint delblks, blkflags, prjflags = 0; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; struct xfs_dquot *udq_delblks = NULL; struct xfs_dquot *gdq_delblks = NULL; struct xfs_dquot *pdq_delblks = NULL; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); delblks = ip->i_delayed_blks; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to * unreserve those from the old dquot, and add them to the * new dquot. */ if (delblks) { ASSERT(ip->i_udquot); udq_unres = ip->i_udquot; } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); gdq_unres = ip->i_gdquot; } } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); pdq_unres = ip->i_pdquot; } } error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, ip->i_d.di_nblocks, 1, flags | blkflags | prjflags); if (error) return error; /* * Do the delayed blks reservations/unreservations now. Since, these * are done without the help of a transaction, if a reservation fails * its previous reservations won't be automatically undone by trans * code. So, we have to do it manually here. */ if (delblks) { /* * Do the reservations first. Unreservation can't fail. */ ASSERT(udq_delblks || gdq_delblks || pdq_delblks); ASSERT(udq_unres || gdq_unres || pdq_unres); error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, (xfs_qcnt_t)delblks, 0, flags | blkflags | prjflags); if (error) return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_unres, gdq_unres, pdq_unres, -((xfs_qcnt_t)delblks), 0, blkflags); } return 0; } int xfs_qm_vop_rename_dqattach( struct xfs_inode **i_tab) { struct xfs_mount *mp = i_tab[0]->i_mount; int i; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; for (i = 0; (i < 4 && i_tab[i]); i++) { struct xfs_inode *ip = i_tab[i]; int error; /* * Watch out for duplicate entries in the table. */ if (i == 0 || ip != i_tab[i-1]) { if (XFS_NOT_DQATTACHED(mp, ip)) { error = xfs_qm_dqattach(ip); if (error) return error; } } } return 0; } void xfs_qm_vop_create_dqattach( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp) { struct xfs_mount *mp = tp->t_mountp; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); } } |
186 16 1 10 76 6 75 75 75 9 9 9 4 4 4 4 4 4 75 75 75 66 242 196 196 66 80 109 109 131 131 131 1 131 80 76 6 80 6 6 197 112 77 77 1 64 66 66 66 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 | /* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/blkdev.h> #include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) #define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) #define SEG0_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) \ (SM_I(sbi) ? SM_I(sbi)->segment_count : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) #define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ ((!is_valid_data_blkaddr(sbi, blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. * LEFT means the opposite direction. */ enum { ALLOC_RIGHT = 0, ALLOC_LEFT }; /* * In the victim_sel_policy->alloc_mode, there are two block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. */ enum { LFS = 0, SSR }; /* * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. */ enum { GC_CB = 0, GC_GREEDY, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, }; /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, FG_GC, FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_segmap; /* dirty segment bitmap */ unsigned int max_search; /* maximum # of segments to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ unsigned int min_segno; /* segment # having min. cost */ }; struct seg_entry { unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ unsigned int valid_blocks:10; /* # of valid blocks */ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ #ifdef CONFIG_F2FS_CHECK_FS unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ #endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; unsigned long long mtime; /* modification time of the segment */ }; struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; /* * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. */ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) #define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; struct page *page; block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { const struct segment_allocation *s_ops; block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ #ifdef CONFIG_F2FS_CHECK_FS char *sit_bitmap_mir; /* SIT bitmap mirror */ #endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ /* for cost-benefit algorithm in cleaning procedure */ unsigned long long elapsed_time; /* elapsed time after mount */ unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { unsigned int start_segno; /* start segment number logically */ unsigned int free_segments; /* # of free segments */ unsigned int free_sections; /* # of free sections */ spinlock_t segmap_lock; /* free segmap lock */ unsigned long *free_segmap; /* free segment bitmap */ unsigned long *free_secmap; /* free section bitmap */ }; /* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ enum dirty_type { DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ DIRTY, /* to count # of dirty segments */ PRE, /* to count # of entirely obsolete segments */ NR_DIRTY_TYPE }; struct dirty_seglist_info { const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ }; /* victim selection function for cleaning and SSR */ struct victim_selection { int (*get_victim)(struct f2fs_sb_info *, unsigned int *, int, int, char); }; /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ }; struct sit_entry_set { struct list_head set_list; /* link with all sit sets */ unsigned int start_segno; /* start segno of sits in set */ unsigned int entry_cnt; /* the # of sit entries in set */ }; /* * inline functions */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sentries[segno]; } static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; } static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { se->valid_blocks = GET_SIT_VBLOCKS(rs); se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #ifdef CONFIG_F2FS_CHECK_FS memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); rs->mtime = cpu_to_le64(se->mtime); } static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, struct page *page, unsigned int start) { struct f2fs_sit_block *raw_sit; struct seg_entry *se; struct f2fs_sit_entry *rs; unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); int i; raw_sit = (struct f2fs_sit_block *)page_address(page); memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); __seg_info_to_raw_sit(se, rs); } } static inline void seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { __seg_info_to_raw_sit(se, rs); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, unsigned int max, unsigned int segno) { unsigned int ret; spin_lock(&free_i->segmap_lock); ret = find_next_bit(free_i->free_segmap, max, segno); spin_unlock(&free_i->segmap_lock); return ret; } static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); free_i->free_segments++; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } spin_unlock(&free_i->segmap_lock); } static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; if (IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } } skip_free: spin_unlock(&free_i->segmap_lock); } static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } spin_unlock(&free_i->segmap_lock); } static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, sit_i->bitmap_size)) f2fs_bug_on(sbi, 1); #endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } static inline block_t written_block_count(struct f2fs_sb_info *sbi) { return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[PRE]; } static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; } static inline int overprovision_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->ovp_segments; } static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int dent_blocks) { unsigned int segno, left_blocks; int i; /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = sbi->blocks_per_seg - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; } /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; left_blocks = sbi->blocks_per_seg - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; return true; } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; free = free_sections(sbi) + freed; need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); if (free > need_upper) return false; else if (free <= need_lower) return true; return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. * And, users can control the policy through sysfs entries. * There are five policies with triggering conditions as follows. * F2FS_IPU_FORCE - all the time, * F2FS_IPU_SSR - if SSR mode is activated, * F2FS_IPU_UTIL - if FS utilization is over threashold, * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. IPU will be triggered only if the # of dirty * pages over min_fsync_blocks. * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 #define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->segno; } static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->alloc_type; } static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->next_blkoff; } static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { struct f2fs_sb_info *sbi = fio->sbi; if (__is_meta_io(fio)) verify_blkaddr(sbi, blk_addr, META_GENERIC); else verify_blkaddr(sbi, blk_addr, DATA_GENERIC); } /* * Summary block is always treated as an invalid block */ static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, sbi->blocks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, sbi->blocks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_msg(sbi->sb, KERN_ERR, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); return -EFSCORRUPTED; } /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) { f2fs_msg(sbi->sb, KERN_ERR, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); return -EFSCORRUPTED; } return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; check_seg_range(sbi, start); #ifdef CONFIG_F2FS_CHECK_FS if (f2fs_test_bit(offset, sit_i->sit_bitmap) != f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) f2fs_bug_on(sbi, 1); #endif /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; return blk_addr; } static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, pgoff_t block_addr) { struct sit_info *sit_i = SIT_I(sbi); block_addr -= sit_i->sit_base_addr; if (block_addr < sit_i->sit_blocks) block_addr += sit_i->sit_blocks; else block_addr -= sit_i->sit_blocks; return block_addr + sit_i->sit_base_addr; } static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); #endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); time64_t diff, now = ktime_get_real_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; /* system time is set to the past */ if (!base_time) { diff = sit_i->mounted_time - now; if (sit_i->elapsed_time >= diff) return sit_i->elapsed_time - diff; return 0; } return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, unsigned int ofs_in_node, unsigned char version) { sum->nid = cpu_to_le32(nid); sum->ofs_in_node = cpu_to_le16(ofs_in_node); sum->version = version; } static inline block_t start_sum_block(struct f2fs_sb_info *sbi) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, * 512 pages (2MB) * 8 for nodes, and * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) return 8 * BIO_MAX_PAGES; else return 0; } /* * When writing pages, it'd better align nr_to_write for segment size. */ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, struct writeback_control *wbc) { long nr_to_write, desired; if (wbc->sync_mode != WB_SYNC_NONE) return 0; nr_to_write = wbc->nr_to_write; desired = BIO_MAX_PAGES; if (type == NODE) desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; } static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; bool wakeup = false; int i; if (force) goto wake_up; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dcc->discard_granularity) break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; } } mutex_unlock(&dcc->cmd_lock); if (!wakeup) return; wake_up: dcc->discard_wake = 1; wake_up_interruptible_all(&dcc->discard_wait_queue); } |
59 59 40 40 40 59 80 59 40 40 59 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQ_H #define _LINUX_IRQ_H /* * Please do not include this file in generic code. There is currently * no requirement for any architecture to implement anything held * within this file. * * Thanks. --rmk */ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqhandler.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/topology.h> #include <linux/io.h> #include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> struct seq_file; struct module; struct msi_msg; enum irqchip_irq_state; /* * IRQ line status. * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * IRQ_TYPE_LEVEL_HIGH - high level triggered * IRQ_TYPE_LEVEL_LOW - low level triggered * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * IRQ_TYPE_SENSE_MASK - Mask for all the above bits * IRQ_TYPE_DEFAULT - For use by some PICs to ask irq_set_type * to setup the HW to a sane default (used * by irqdomain map() callbacks to synchronize * the HW state and SW flags for a newly * allocated descriptor). * * IRQ_TYPE_PROBE - Special flag for probing in progress * * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. * IRQ_DISABLE_UNLAZY - Disable lazy irq disable */ enum { IRQ_TYPE_NONE = 0x00000000, IRQ_TYPE_EDGE_RISING = 0x00000001, IRQ_TYPE_EDGE_FALLING = 0x00000002, IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), IRQ_TYPE_LEVEL_HIGH = 0x00000004, IRQ_TYPE_LEVEL_LOW = 0x00000008, IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), IRQ_TYPE_SENSE_MASK = 0x0000000f, IRQ_TYPE_DEFAULT = IRQ_TYPE_SENSE_MASK, IRQ_TYPE_PROBE = 0x00000010, IRQ_LEVEL = (1 << 8), IRQ_PER_CPU = (1 << 9), IRQ_NOPROBE = (1 << 10), IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), IRQ_DISABLE_UNLAZY = (1 << 19), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) /* * Return value for chip->irq_set_affinity() * * IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity * IRQ_SET_MASK_NOCPY - OK, chip did update irq_common_data.affinity * IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to * support stacked irqchips, which indicates skipping * all descendent irqchips. */ enum { IRQ_SET_MASK_OK = 0, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_OK_DONE, }; struct msi_desc; struct irq_domain; /** * struct irq_common_data - per irq data shared by all irqchips * @state_use_accessors: status information for irq chip functions. * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @effective_affinity: The effective IRQ affinity on SMP as some irq * chips do not allow multi CPU destinations. * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK cpumask_var_t effective_affinity; #endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif }; /** * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. * @parent_data: pointer to parent struct irq_data to support hierarchy * irq_domain * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations */ struct irq_data { u32 mask; unsigned int irq; unsigned long hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data *parent_data; #endif void *chip_data; }; /* * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend * IRDQ_MOVE_PCNTXT - Interrupt can be moved in process * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_MSI_NOMASK_QUIRK - Non-maskable MSI quirk for affinity change * required * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call * irq_chip::irq_set_affinity() when deactivated. */ enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = (1 << 8), IRQD_ACTIVATED = (1 << 9), IRQD_NO_BALANCING = (1 << 10), IRQD_PER_CPU = (1 << 11), IRQD_AFFINITY_SET = (1 << 12), IRQD_LEVEL = (1 << 13), IRQD_WAKEUP_STATE = (1 << 14), IRQD_MOVE_PCNTXT = (1 << 15), IRQD_IRQ_DISABLED = (1 << 16), IRQD_IRQ_MASKED = (1 << 17), IRQD_IRQ_INPROGRESS = (1 << 18), IRQD_WAKEUP_ARMED = (1 << 19), IRQD_FORWARDED_TO_VCPU = (1 << 20), IRQD_AFFINITY_MANAGED = (1 << 21), IRQD_IRQ_STARTED = (1 << 22), IRQD_MANAGED_SHUTDOWN = (1 << 23), IRQD_SINGLE_TARGET = (1 << 24), IRQD_DEFAULT_TRIGGER_SET = (1 << 25), IRQD_CAN_RESERVE = (1 << 26), IRQD_MSI_NOMASK_QUIRK = (1 << 27), IRQD_AFFINITY_ON_ACTIVATE = (1 << 29), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline bool irqd_trigger_type_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* * Must only be called inside irq_chip.irq_set_type() functions or * from the DT/ACPI setup code. */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_LEVEL; } /* * Must only be called of irqchip.irq_set_affinity() or low level * hieararchy domain allocation functions. */ static inline void irqd_set_single_target(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SINGLE_TARGET; } static inline bool irqd_is_single_target(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_can_move_in_process_context(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; } static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } static inline bool irqd_is_forwarded_to_vcpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_FORWARDED_TO_VCPU; } static inline void irqd_set_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) |= IRQD_FORWARDED_TO_VCPU; } static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } static inline bool irqd_affinity_is_managed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } static inline bool irqd_is_activated(struct irq_data *d) { return __irqd_to_state(d) & IRQD_ACTIVATED; } static inline void irqd_set_activated(struct irq_data *d) { __irqd_to_state(d) |= IRQD_ACTIVATED; } static inline void irqd_clr_activated(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_ACTIVATED; } static inline bool irqd_is_started(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_STARTED; } static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } static inline void irqd_set_can_reserve(struct irq_data *d) { __irqd_to_state(d) |= IRQD_CAN_RESERVE; } static inline void irqd_clr_can_reserve(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; } static inline bool irqd_can_reserve(struct irq_data *d) { return __irqd_to_state(d) & IRQD_CAN_RESERVE; } static inline void irqd_set_msi_nomask_quirk(struct irq_data *d) { __irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK; } static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK; } static inline bool irqd_msi_nomask_quirk(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK; } static inline void irqd_set_affinity_on_activate(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; } static inline bool irqd_affinity_on_activate(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } /** * struct irq_chip - hardware interrupt chip descriptor * * @parent_device: pointer to parent device for irqchip * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) * @irq_disable: disable the interrupt * @irq_ack: start of a new interrupt * @irq_mask: mask an interrupt source * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force * argument is true, it tells the driver to * unconditionally apply the affinity setting. Sanity * checks against the supplied affinity mask are not * required. This is used for CPU hotplug where the * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_suspend: function called from core code on suspend once per * chip, when one or more interrupts are installed * @irq_resume: function called from core code on resume once per chip, * when one ore more interrupts are installed * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @irq_request_resources: optional to request resources before calling * any other callback related to this irq * @irq_release_resources: optional to release resources acquired with * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @ipi_send_single: send a single IPI to destination cpus * @ipi_send_mask: send an IPI to destination cpus in cpumask * @flags: chip specific flags */ struct irq_chip { struct device *parent_device; const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); void (*irq_disable)(struct irq_data *data); void (*irq_ack)(struct irq_data *data); void (*irq_mask)(struct irq_data *data); void (*irq_mask_ack)(struct irq_data *data); void (*irq_unmask)(struct irq_data *data); void (*irq_eoi)(struct irq_data *data); int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); int (*irq_retrigger)(struct irq_data *data); int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); int (*irq_set_wake)(struct irq_data *data, unsigned int on); void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); int (*irq_request_resources)(struct irq_data *data); void (*irq_release_resources)(struct irq_data *data); void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); unsigned long flags; }; /* * irq_chip specific flags * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI Chip can provide two doorbells for Level MSIs * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include <linux/irqdesc.h> /* * Pick up the arch-dependent methods: */ #include <asm/hw_irq.h> #ifndef NR_IRQS_LEGACY # define NR_IRQS_LEGACY 0 #endif #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif #define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_irq(unsigned int irq, struct irqaction *new); extern void remove_irq(unsigned int irq, struct irqaction *act); extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); extern void irq_cpu_online(void); extern void irq_cpu_offline(void); extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); extern int irq_affinity_online_cpu(unsigned int cpu); #else # define irq_affinity_online_cpu NULL #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { if (unlikely(irqd_is_setaffinity_pending(data))) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); void irq_force_complete_move(struct irq_desc *desc); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq); #else static inline int irq_set_parent(int irq, int parent_irq) { return 0; } #endif /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); extern void irq_chip_unmask_parent(struct irq_data *data); extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info); extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type); #endif /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ extern int can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name); static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); } extern int irq_set_percpu_devid(unsigned int irq); extern int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity); extern int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); static inline void irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 0, NULL); } /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 1, NULL); } /* * Set a highlevel chained flow handler and its data for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) { irq_modify_status(irq, 0, set); } static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) { irq_modify_status(irq, clr, 0); } static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } static inline void irq_set_nothread(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOTHREAD); } static inline void irq_set_thread(unsigned int irq) { irq_modify_status(irq, IRQ_NOTHREAD, 0); } static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) irq_set_status_flags(irq, IRQ_NESTED_THREAD); else irq_clear_status_flags(irq, IRQ_NESTED_THREAD); } static inline void irq_set_percpu_devid_flags(unsigned int irq) { irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD | IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; } static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) { return d->chip; } static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; } static inline void *irq_data_get_irq_chip_data(struct irq_data *d) { return d->chip_data; } static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->handler_data : NULL; } static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->common->handler_data; } static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->msi_desc : NULL; } static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d) { return d->common->msi_desc; } static inline u32 irq_get_trigger_type(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irqd_get_trigger_type(d) : 0; } static inline int irq_common_data_get_node(struct irq_common_data *d) { #ifdef CONFIG_NUMA return d->node; #else return 0; #endif } static inline int irq_data_get_node(struct irq_data *d) { return irq_common_data_get_node(d->common); } static inline struct cpumask *irq_get_affinity_mask(int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->affinity : NULL; } static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) { return d->common->affinity; } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { cpumask_copy(d->common->effective_affinity, m); } #else static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { } static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->affinity; } #endif unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct cpumask *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct cpumask *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 0, 1, node) #define irq_alloc_desc_at(at, node) \ irq_alloc_descs(at, at, 1, node) #define irq_alloc_desc_from(from, node) \ irq_alloc_descs(-1, from, 1, node) #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) #define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) #define devm_irq_alloc_desc(dev, node) \ devm_irq_alloc_descs(dev, -1, 0, 1, node) #define devm_irq_alloc_desc_at(dev, at, node) \ devm_irq_alloc_descs(dev, at, at, 1, node) #define devm_irq_alloc_desc_from(dev, from, node) \ devm_irq_alloc_descs(dev, -1, from, 1, node) #define devm_irq_alloc_descs_from(dev, from, cnt, node) \ devm_irq_alloc_descs(dev, -1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } #ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ unsigned int irq_alloc_hwirqs(int cnt, int node); static inline unsigned int irq_alloc_hwirq(int node) { return irq_alloc_hwirqs(1, node); } void irq_free_hwirqs(unsigned int from, int cnt); static inline void irq_free_hwirq(unsigned int irq) { return irq_free_hwirqs(irq, 1); } int arch_setup_hwirq(unsigned int irq, int node); void arch_teardown_hwirq(unsigned int irq); #endif #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base * @disable: Disable register offset to reg_base * @mask: Mask register offset to reg_base * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base * @polarity: Polarity configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; unsigned long disable; unsigned long mask; unsigned long ack; unsigned long eoi; unsigned long type; unsigned long polarity; }; /** * struct irq_chip_type - Generic interrupt chip instance for a flow type * @chip: The real interrupt chip which provides the callbacks * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types * @mask_cache_priv: Cached mask register private to the chip type * @mask_cache: Pointer to cached mask register * * A irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different * flow types. */ struct irq_chip_type { struct irq_chip chip; struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; u32 mask_cache_priv; u32 *mask_cache; }; /** * struct irq_chip_generic - Generic irq chip data structure * @lock: Lock to protect register and cache data access * @reg_base: Register base address (virtual) * @reg_readl: Alternate I/O accessor (defaults to readl if NULL) * @reg_writel: Alternate I/O accessor (defaults to writel if NULL) * @suspend: Function called from core code on suspend once per * chip; can be useful instead of irq_chip::suspend to * handle chip details even when no interrupts are in use * @resume: Function called from core code on resume once per chip; * can be useful instead of irq_chip::suspend to handle * chip details even when no interrupts are in use * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types * @type_cache: Cached type register * @polarity_cache: Cached polarity register * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note, that irq_chip_generic can have multiple irq_chip_type * implementations which can be associated to a particular irq line of * an irq_chip_generic instance. That allows to share and protect * state in an irq_chip_generic instance when we need to implement * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { raw_spinlock_t lock; void __iomem *reg_base; u32 (*reg_readl)(void __iomem *addr); void (*reg_writel)(u32 val, void __iomem *addr); void (*suspend)(struct irq_chip_generic *gc); void (*resume)(struct irq_chip_generic *gc); unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; u32 type_cache; u32 polarity_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; void *private; unsigned long installed; unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[0]; }; /** * enum irq_gc_flags - Initialization flags for generic irq chips * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask * @IRQ_GC_BE_IO: Use big-endian register accesses (default: LE) */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, IRQ_GC_NO_MASK = 1 << 3, IRQ_GC_BE_IO = 1 << 4, }; /* * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains * @irqs_per_chip: Number of interrupts per chip * @num_chips: Number of chips * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { unsigned int irqs_per_chip; unsigned int num_chips; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; struct irq_chip_generic *gc[0]; }; /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); void irq_gc_mask_set_bit(struct irq_data *d); void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); /* Setup functions for irq_chip_generic */ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq); struct irq_chip_generic * irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, unsigned int clr, unsigned int set, enum irq_gc_flags flags); #define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ handler, clr, set, flags) \ ({ \ MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ handler, clr, set, flags); \ }) static inline void irq_free_generic_chip(struct irq_chip_generic *gc) { kfree(gc); } static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { irq_remove_generic_chip(gc, msk, clr, set); irq_free_generic_chip(gc); } static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); } #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) #ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } #else static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif /* * The irqsave variants are for usage in non interrupt code. Do not use * them in irq_chip callbacks. Use irq_gc_lock() instead. */ #define irq_gc_lock_irqsave(gc, flags) \ raw_spin_lock_irqsave(&(gc)->lock, flags) #define irq_gc_unlock_irqrestore(gc, flags) \ raw_spin_unlock_irqrestore(&(gc)->lock, flags) static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { if (gc->reg_writel) gc->reg_writel(val, gc->reg_base + reg_offset); else writel(val, gc->reg_base + reg_offset); } static inline u32 irq_reg_readl(struct irq_chip_generic *gc, int reg_offset) { if (gc->reg_readl) return gc->reg_readl(gc->reg_base + reg_offset); else return readl(gc->reg_base + reg_offset); } struct irq_matrix; struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end); void irq_matrix_online(struct irq_matrix *m); void irq_matrix_offline(struct irq_matrix *m); void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu); void irq_matrix_reserve(struct irq_matrix *m); void irq_matrix_remove_reserved(struct irq_matrix *m); int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu); void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed); void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); unsigned int irq_matrix_allocated(struct irq_matrix *m); unsigned int irq_matrix_reserved(struct irq_matrix *m); void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in * the system, which is generally the first C code called from an assembly * architecture-specific interrupt handler. * * Returns 0 on success, or -EBUSY if an IRQ handler has already been * registered. */ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); /* * Allows interrupt handlers to find the irqchip that's been registered as the * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; #endif #endif /* _LINUX_IRQ_H */ |
50 42 50 1 1 35 1 1 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat is as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum); |
13 15 41 358 86 64 19 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <linux/seq_file.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * Lock subclasses for tty locks * * TTY_LOCK_NORMAL is for normal ttys and master ptys. * TTY_LOCK_SLAVE is for slave ptys only. * * Lock subclasses are necessary for handling nested locking with pty pairs. * tty locks which use nested locking: * * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. * The stable lock order is master pty first, then slave pty. * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem. * Subclassing this lock enables the slave pty to hold its * termios_rwsem when claiming the master tty_buffer lock. * tty_buffer lock - slave ptys can claim nested buffer lock when handling * signal chars. The stable lock order is slave pty, then * master. */ enum { TTY_LOCK_NORMAL = 0, TTY_LOCK_SLAVE, }; /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' struct tty_buffer { union { struct tty_buffer *next; struct llist_node free; }; int used; int size; int commit; int read; int flags; /* Data points here */ unsigned long data[]; }; /* Values for .flags field of tty_buffer */ #define TTYB_NORMAL 1 /* buffer has no flags buffer */ static inline unsigned char *char_buf_ptr(struct tty_buffer *b, int ofs) { return ((unsigned char *)b->data) + ofs; } static inline char *flag_buf_ptr(struct tty_buffer *b, int ofs) { return (char *)char_buf_ptr(b, ofs) + b->size; } struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct work_struct work; struct mutex lock; atomic_t priority; struct tty_buffer sentinel; struct llist_head free; /* Free queue head */ atomic_t mem_used; /* In-use buffers excluding free list */ int mem_limit; struct tty_buffer *tail; /* Active buffer */ }; /* * When a break, frame error, or parity error happens, these codes are * stuffed into the flags buffer. */ #define TTY_NORMAL 0 #define TTY_BREAK 1 #define TTY_FRAME 2 #define TTY_PARITY 3 #define TTY_OVERRUN 4 #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) _L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; /* * Port level information. Each device keeps its own port level information * so provide a common structure for those ports wanting to use common support * routines. * * The tty port has a different lifetime to the tty so must be kept apart. * In addition be careful as tty -> port mappings are valid for the life * of the tty object but in many cases port -> tty mappings are valid only * until a hangup so don't use the wrong path. */ struct tty_port; struct tty_port_operations { /* Return 1 if the carrier is raised */ int (*carrier_raised)(struct tty_port *port); /* Control the DTR line */ void (*dtr_rts)(struct tty_port *port, int raise); /* Called when the last close completes or a hangup finishes IFF the port was initialized. Do not use to free resources. Called under the port mutex to serialize against activate/shutdowns */ void (*shutdown)(struct tty_port *port); /* Called under the port mutex from tty_port_open, serialized using the port mutex */ /* FIXME: long term getting the tty argument *out* of this would be good for consoles */ int (*activate)(struct tty_port *port, struct tty_struct *tty); /* Called on the final put of a port */ void (*destruct)(struct tty_port *port); }; struct tty_port_client_operations { int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t); void (*write_wakeup)(struct tty_port *port); }; extern const struct tty_port_client_operations tty_port_default_client_ops; struct tty_port { struct tty_bufhead buf; /* Locked internally */ struct tty_struct *tty; /* Back pointer */ struct tty_struct *itty; /* internal back ptr */ const struct tty_port_operations *ops; /* Port operations */ const struct tty_port_client_operations *client_ops; /* Port client operations */ spinlock_t lock; /* Lock protecting tty field */ int blocked_open; /* Waiting to open */ int count; /* Usage count */ wait_queue_head_t open_wait; /* Open waiters */ wait_queue_head_t delta_msr_wait; /* Modem status change */ unsigned long flags; /* User TTY flags ASYNC_ */ unsigned long iflags; /* Internal flags TTY_PORT_ */ unsigned char console:1, /* port is a console */ low_latency:1; /* optional: tune for latency */ struct mutex mutex; /* Locking */ struct mutex buf_mutex; /* Buffer alloc lock */ unsigned char *xmit_buf; /* Optional buffer */ unsigned int close_delay; /* Close port delay */ unsigned int closing_wait; /* Delay for output */ int drain_delay; /* Set to zero if no pure time based drain is needed else set to size of fifo */ struct kref kref; /* Ref counter */ void *client_data; }; /* tty_port::iflags bits -- use atomic bit ops */ #define TTY_PORT_INITIALIZED 0 /* device is initialized */ #define TTY_PORT_SUSPENDED 1 /* device is suspended */ #define TTY_PORT_ACTIVE 2 /* device is open */ /* * uart drivers: use the uart_port::status field and the UPSTAT_* defines * for s/w-based flow control steering and carrier detection status */ #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ #define TTY_PORT_KOPENED 5 /* device exclusively opened by kernel */ /* * Where all of the state associated with a tty is kept while the tty * is open. Since the termios state should be kept even if the tty * has been closed --- for things like the baud rate, etc --- it is * not stored here, but rather a pointer to the real state is stored * here. Possible the winsize structure should have the same * treatment, but (1) the default 80x24 is usually right and (2) it's * most often used by a windowing system, which will set the correct * size each time the window is created or resized anyway. * - TYT, 9/14/92 */ struct tty_operations; struct tty_struct { int magic; struct kref kref; struct device *dev; struct tty_driver *driver; const struct tty_operations *ops; int index; /* Protects ldisc changes: Lock tty not pty */ struct ld_semaphore ldisc_sem; struct tty_ldisc *ldisc; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; spinlock_t ctrl_lock; spinlock_t flow_lock; /* Termios values are protected by the termios rwsem */ struct ktermios termios, termios_locked; struct termiox *termiox; /* May be NULL for unsupported */ char name[64]; struct pid *pgrp; /* Protected by ctrl lock */ /* * Writes protected by both ctrl lock and legacy mutex, readers must use * at least one of them. */ struct pid *session; unsigned long flags; int count; struct winsize winsize; /* winsize_mutex */ unsigned long stopped:1, /* flow_lock */ flow_stopped:1, unused:BITS_PER_LONG - 2; int hw_stopped; unsigned long ctrl_status:8, /* ctrl_lock */ packet:1, unused_ctrl:BITS_PER_LONG - 9; unsigned int receive_room; /* Bytes free for queue */ int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; /* protects tty_files list */ struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 int closing; unsigned char *write_buf; int write_cnt; /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /* tty magic number */ #define TTY_MAGIC 0x5401 /* * These bits are used in the flags field of the tty structure. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. */ #define TTY_THROTTLED 0 /* Call unthrottle() at threshold min */ #define TTY_IO_ERROR 1 /* Cause an I/O error (may be no ldisc too) */ #define TTY_OTHER_CLOSED 2 /* Other side (if any) has closed */ #define TTY_EXCLUSIVE 3 /* Exclusive open mode */ #define TTY_DO_WRITE_WAKEUP 5 /* Call write_wakeup after queuing new */ #define TTY_LDISC_OPEN 11 /* Line discipline is open */ #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_CHANGING 20 /* Change pending - non-block IO */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ #define TTY_THROTTLE_SAFE 1 #define TTY_UNTHROTTLE_SAFE 2 static inline void __tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; } static inline void tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; smp_mb(); } static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY extern void tty_kref_put(struct tty_struct *tty); extern struct pid *tty_get_pgrp(struct tty_struct *tty); extern void tty_vhangup_self(void); extern void disassociate_ctty(int priv); extern dev_t tty_devnum(struct tty_struct *tty); extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); extern struct tty_struct *tty_kopen(dev_t device); extern void tty_kclose(struct tty_struct *tty); extern int tty_dev_name_to_number(const char *name, dev_t *number); extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout); extern void tty_ldisc_unlock(struct tty_struct *tty); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; extern int vcs_init(void); extern struct class *tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Return a new reference to a tty object. The caller must hold * sufficient locks/counts to ensure that their existing reference cannot * go away */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } extern const char *tty_driver_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int __tty_check_change(struct tty_struct *tty, int sig); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); extern void stop_tty(struct tty_struct *tty); extern void __start_tty(struct tty_struct *tty); extern void start_tty(struct tty_struct *tty); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); extern struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern void tty_unregister_device(struct tty_driver *driver, unsigned index); extern void tty_write_message(struct tty_struct *tty, char *msg); extern int tty_send_xchar(struct tty_struct *tty, char ch); extern int tty_put_char(struct tty_struct *tty, unsigned char c); extern int tty_chars_in_buffer(struct tty_struct *tty); extern int tty_write_room(struct tty_struct *tty); extern void tty_driver_flush_buffer(struct tty_struct *tty); extern void tty_throttle(struct tty_struct *tty); extern void tty_unthrottle(struct tty_struct *tty); extern int tty_throttle_safe(struct tty_struct *tty); extern int tty_unthrottle_safe(struct tty_struct *tty); extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern int is_current_pgrp_orphaned(void); extern void tty_hangup(struct tty_struct *tty); extern void tty_vhangup(struct tty_struct *tty); extern void tty_vhangup_session(struct tty_struct *tty); extern int tty_hung_up_p(struct file *filp); extern void do_SAK(struct tty_struct *tty); extern void __do_SAK(struct tty_struct *tty); extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty); extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session); extern void session_clear_tty(struct pid *session); extern void no_tty(void); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld); extern void tty_buffer_init(struct tty_port *port); extern void tty_buffer_set_lock_subclass(struct tty_port *port); extern bool tty_buffer_restart_work(struct tty_port *port); extern bool tty_buffer_cancel_work(struct tty_port *port); extern void tty_buffer_flush_work(struct tty_port *port); extern speed_t tty_termios_baud_rate(struct ktermios *termios); extern speed_t tty_termios_input_baud_rate(struct ktermios *termios); extern void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); extern void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns the baud rate as an integer for this terminal. The * termios lock must be held by the caller and the terminal bit * flags may be updated. * * Locking: none */ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); extern int tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b); extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *); extern void tty_ldisc_deref(struct tty_ldisc *); extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); extern void tty_ldisc_hangup(struct tty_struct *tty, bool reset); extern int tty_ldisc_reinit(struct tty_struct *tty, int disc); extern const struct seq_operations tty_ldiscs_seq_ops; extern void tty_wakeup(struct tty_struct *tty); extern void tty_ldisc_flush(struct tty_struct *tty); extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, struct file *file, unsigned int cmd, unsigned long arg); extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern void tty_default_fops(struct file_operations *fops); extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx); extern int tty_alloc_file(struct file *file); extern void tty_add_file(struct tty_struct *tty, struct file *file); extern void tty_free_file(struct file *file); extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); extern void tty_release_struct(struct tty_struct *tty, int idx); extern int tty_release(struct inode *inode, struct file *filp); extern void tty_init_termios(struct tty_struct *tty); extern void tty_save_termios(struct tty_struct *tty); extern int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; #define tty_is_writelocked(tty) (mutex_is_locked(&tty->atomic_write_lock)) extern void tty_port_init(struct tty_port *port); extern void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); extern struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); extern struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index); extern int tty_port_alloc_xmit_buf(struct tty_port *port); extern void tty_port_free_xmit_buf(struct tty_port *port); extern void tty_port_destroy(struct tty_port *port); extern void tty_port_put(struct tty_port *port); static inline struct tty_port *tty_port_get(struct tty_port *port) { if (port && kref_get_unless_zero(&port->kref)) return port; return NULL; } /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CTS_FLOW, &port->iflags); else clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline bool tty_port_active(struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline void tty_port_set_active(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_ACTIVE, &port->iflags); else clear_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline bool tty_port_check_carrier(struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CHECK_CD, &port->iflags); else clear_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline bool tty_port_suspended(struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline void tty_port_set_suspended(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_SUSPENDED, &port->iflags); else clear_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline bool tty_port_initialized(struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline void tty_port_set_initialized(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_INITIALIZED, &port->iflags); else clear_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline bool tty_port_kopened(struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } static inline void tty_port_set_kopened(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_KOPENED, &port->iflags); else clear_bit(TTY_PORT_KOPENED, &port->iflags); } extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); extern void tty_port_raise_dtr_rts(struct tty_port *port); extern void tty_port_lower_dtr_rts(struct tty_port *port); extern void tty_port_hangup(struct tty_port *port); extern void tty_port_tty_hangup(struct tty_port *port, bool check_clocal); extern void tty_port_tty_wakeup(struct tty_port *port); extern int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty); extern void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty); extern int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp); static inline int tty_port_users(struct tty_port *port) { return port->count + port->blocked_open; } extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc); extern int tty_unregister_ldisc(int disc); extern int tty_set_ldisc(struct tty_struct *tty, int disc); extern int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty); extern void tty_ldisc_release(struct tty_struct *tty); extern int __must_check tty_ldisc_init(struct tty_struct *tty); extern void tty_ldisc_deinit(struct tty_struct *tty); extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, char *f, int count); /* n_tty.c */ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY extern void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT extern void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size); extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern int tty_audit_push(void); #else static inline void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) { } static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) { } static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); extern long n_tty_compat_ioctl_helper(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); /* vt.c */ extern int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); extern long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ extern void tty_lock(struct tty_struct *tty); extern int tty_lock_interruptible(struct tty_struct *tty); extern void tty_unlock(struct tty_struct *tty); extern void tty_lock_slave(struct tty_struct *tty); extern void tty_unlock_slave(struct tty_struct *tty); extern void tty_set_lock_subclass(struct tty_struct *tty); #ifdef CONFIG_PROC_FS extern void proc_tty_register_driver(struct tty_driver *); extern void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #define tty_msg(fn, tty, f, ...) \ fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__) #define tty_debug(tty, f, ...) tty_msg(pr_debug, tty, f, ##__VA_ARGS__) #define tty_info(tty, f, ...) tty_msg(pr_info, tty, f, ##__VA_ARGS__) #define tty_notice(tty, f, ...) tty_msg(pr_notice, tty, f, ##__VA_ARGS__) #define tty_warn(tty, f, ...) tty_msg(pr_warn, tty, f, ##__VA_ARGS__) #define tty_err(tty, f, ...) tty_msg(pr_err, tty, f, ##__VA_ARGS__) #define tty_info_ratelimited(tty, f, ...) \ tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__) #endif |
1 1 1 1 1 1 1 1 1 30 3 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | /* * drivers/base/power/trace.c * * Copyright (C) 2006 Linus Torvalds * * Trace facility for suspend/resume problems, when none of the * devices may be working. */ #include <linux/pm-trace.h> #include <linux/export.h> #include <linux/rtc.h> #include <linux/suspend.h> #include <linux/init.h> #include <linux/mc146818rtc.h> #include "power.h" /* * Horrid, horrid, horrid. * * It turns out that the _only_ piece of hardware that actually * keeps its value across a hard boot (and, more importantly, the * POST init sequence) is literally the realtime clock. * * Never mind that an RTC chip has 114 bytes (and often a whole * other bank of an additional 128 bytes) of nice SRAM that is * _designed_ to keep data - the POST will clear it. So we literally * can just use the few bytes of actual time data, which means that * we're really limited. * * It means, for example, that we can't use the seconds at all * (since the time between the hang and the boot might be more * than a minute), and we'd better not depend on the low bits of * the minutes either. * * There are the wday fields etc, but I wouldn't guarantee those * are dependable either. And if the date isn't valid, either the * hw or POST will do strange things. * * So we're left with: * - year: 0-99 * - month: 0-11 * - day-of-month: 1-28 * - hour: 0-23 * - min: (0-30)*2 * * Giving us a total range of 0-16128000 (0xf61800), ie less * than 24 bits of actual data we can save across reboots. * * And if your box can't boot in less than three minutes, * you're screwed. * * Now, almost 24 bits of data is pitifully small, so we need * to be pretty dense if we want to use it for anything nice. * What we do is that instead of saving off nice readable info, * we save off _hashes_ of information that we can hopefully * regenerate after the reboot. * * In particular, this means that we might be unlucky, and hit * a case where we have a hash collision, and we end up not * being able to tell for certain exactly which case happened. * But that's hopefully unlikely. * * What we do is to take the bits we can fit, and split them * into three parts (16*997*1009 = 16095568), and use the values * for: * - 0-15: user-settable * - 0-996: file + line number * - 0-1008: device */ #define USERHASH (16) #define FILEHASH (997) #define DEVHASH (1009) #define DEVSEED (7919) bool pm_trace_rtc_abused __read_mostly; EXPORT_SYMBOL_GPL(pm_trace_rtc_abused); static unsigned int dev_hash_value; static int set_magic_time(unsigned int user, unsigned int file, unsigned int device) { unsigned int n = user + USERHASH*(file + FILEHASH*device); // June 7th, 2006 static struct rtc_time time = { .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 7, .tm_mon = 5, // June - counting from zero .tm_year = 106, .tm_wday = 3, .tm_yday = 160, .tm_isdst = 1 }; time.tm_year = (n % 100); n /= 100; time.tm_mon = (n % 12); n /= 12; time.tm_mday = (n % 28) + 1; n /= 28; time.tm_hour = (n % 24); n /= 24; time.tm_min = (n % 20) * 3; n /= 20; mc146818_set_time(&time); pm_trace_rtc_abused = true; return n ? -1 : 0; } static unsigned int read_magic_time(void) { struct rtc_time time; unsigned int val; mc146818_get_time(&time); pr_info("RTC time: %2d:%02d:%02d, date: %02d/%02d/%02d\n", time.tm_hour, time.tm_min, time.tm_sec, time.tm_mon + 1, time.tm_mday, time.tm_year % 100); val = time.tm_year; /* 100 years */ if (val > 100) val -= 100; val += time.tm_mon * 100; /* 12 months */ val += (time.tm_mday-1) * 100 * 12; /* 28 month-days */ val += time.tm_hour * 100 * 12 * 28; /* 24 hours */ val += (time.tm_min / 3) * 100 * 12 * 28 * 24; /* 20 3-minute intervals */ return val; } /* * This is just the sdbm hash function with a user-supplied * seed and final size parameter. */ static unsigned int hash_string(unsigned int seed, const char *data, unsigned int mod) { unsigned char c; while ((c = *data++) != 0) { seed = (seed << 16) + (seed << 6) - seed + c; } return seed % mod; } void set_trace_device(struct device *dev) { dev_hash_value = hash_string(DEVSEED, dev_name(dev), DEVHASH); } EXPORT_SYMBOL(set_trace_device); /* * We could just take the "tracedata" index into the .tracedata * section instead. Generating a hash of the data gives us a * chance to work across kernel versions, and perhaps more * importantly it also gives us valid/invalid check (ie we will * likely not give totally bogus reports - if the hash matches, * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ void generate_pm_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); unsigned int user_hash_value, file_hash_value; if (!x86_platform.legacy.rtc) return; user_hash_value = user % USERHASH; file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); } EXPORT_SYMBOL(generate_pm_trace); extern char __tracedata_start[], __tracedata_end[]; static int show_file_hash(unsigned int value) { int match; char *tracedata; match = 0; for (tracedata = __tracedata_start ; tracedata < __tracedata_end ; tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); unsigned int hash = hash_string(lineno, file, FILEHASH); if (hash != value) continue; pr_info(" hash matches %s:%u\n", file, lineno); match++; } return match; } static int show_dev_hash(unsigned int value) { int match = 0; struct list_head *entry; device_pm_lock(); entry = dpm_list.prev; while (entry != &dpm_list) { struct device * dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { dev_info(dev, "hash matches\n"); match++; } entry = entry->prev; } device_pm_unlock(); return match; } static unsigned int hash_value_early_read; int show_trace_dev_match(char *buf, size_t size) { unsigned int value = hash_value_early_read / (USERHASH * FILEHASH); int ret = 0; struct list_head *entry; /* * It's possible that multiple devices will match the hash and we can't * tell which is the culprit, so it's best to output them all. */ device_pm_lock(); entry = dpm_list.prev; while (size && entry != &dpm_list) { struct device *dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { int len = snprintf(buf, size, "%s\n", dev_driver_string(dev)); if (len > size) len = size; buf += len; ret += len; size -= len; } entry = entry->prev; } device_pm_unlock(); return ret; } static int pm_trace_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { switch (mode) { case PM_POST_HIBERNATION: case PM_POST_SUSPEND: if (pm_trace_rtc_abused) { pm_trace_rtc_abused = false; pr_warn("Possible incorrect RTC due to pm_trace, please use 'ntpdate' or 'rdate' to reset it.\n"); } break; default: break; } return 0; } static struct notifier_block pm_trace_nb = { .notifier_call = pm_trace_notify, }; static int early_resume_init(void) { if (!x86_platform.legacy.rtc) return 0; hash_value_early_read = read_magic_time(); register_pm_notifier(&pm_trace_nb); return 0; } static int late_resume_init(void) { unsigned int val = hash_value_early_read; unsigned int user, file, dev; if (!x86_platform.legacy.rtc) return 0; user = val % USERHASH; val = val / USERHASH; file = val % FILEHASH; val = val / FILEHASH; dev = val /* % DEVHASH */; pr_info(" Magic number: %d:%d:%d\n", user, file, dev); show_file_hash(file); show_dev_hash(dev); return 0; } core_initcall(early_resume_init); late_initcall(late_resume_init); |
60 60 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_LOG_FORMAT_H__ #define __XFS_LOG_FORMAT_H__ struct xfs_mount; struct xfs_trans_res; /* * On-disk Log Format definitions. * * This file contains all the on-disk format definitions used within the log. It * includes the physical log structure itself, as well as all the log item * format structures that are written into the log and intepreted by log * recovery. We start with the physical log format definitions, and then work * through all the log items definitions and everything they encode into the * log. */ typedef uint32_t xlog_tid_t; #define XLOG_MIN_ICLOGS 2 #define XLOG_MAX_ICLOGS 8 #define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ #define XLOG_VERSION_1 1 #define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ #define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) #define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ #define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ #define XLOG_MAX_RECORD_BSIZE (256*1024) #define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ #define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ (log)->l_mp->m_sb.sb_logsunit) #define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) #define XLOG_HEADER_SIZE 512 /* Minimum number of transactions that must fit in the log (defined by mkfs) */ #define XFS_MIN_LOG_FACTOR 3 #define XLOG_REC_SHIFT(log) \ BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) #define XLOG_TOTAL_REC_SHIFT(log) \ BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) /* get lsn fields */ #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) /* this is used in a spot where we might otherwise double-endian-flip */ #define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) { return ((xfs_lsn_t)cycle << 32) | block; } static inline uint xlog_get_cycle(char *ptr) { if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) return be32_to_cpu(*((__be32 *)ptr + 1)); else return be32_to_cpu(*(__be32 *)ptr); } /* Log Clients */ #define XFS_TRANSACTION 0x69 #define XFS_VOLUME 0x2 #define XFS_LOG 0xaa #define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ /* * Log item for unmount records. * * The unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now; see xfs_log_unmount_write. */ struct xfs_unmount_log_format { uint16_t magic; /* XLOG_UNMOUNT_TYPE */ uint16_t pad1; uint32_t pad2; /* may as well make it 64 bits */ }; /* Region types for iovec's i_type */ #define XLOG_REG_TYPE_BFORMAT 1 #define XLOG_REG_TYPE_BCHUNK 2 #define XLOG_REG_TYPE_EFI_FORMAT 3 #define XLOG_REG_TYPE_EFD_FORMAT 4 #define XLOG_REG_TYPE_IFORMAT 5 #define XLOG_REG_TYPE_ICORE 6 #define XLOG_REG_TYPE_IEXT 7 #define XLOG_REG_TYPE_IBROOT 8 #define XLOG_REG_TYPE_ILOCAL 9 #define XLOG_REG_TYPE_IATTR_EXT 10 #define XLOG_REG_TYPE_IATTR_BROOT 11 #define XLOG_REG_TYPE_IATTR_LOCAL 12 #define XLOG_REG_TYPE_QFORMAT 13 #define XLOG_REG_TYPE_DQUOT 14 #define XLOG_REG_TYPE_QUOTAOFF 15 #define XLOG_REG_TYPE_LRHEADER 16 #define XLOG_REG_TYPE_UNMOUNT 17 #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_ICREATE 20 #define XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 #define XLOG_REG_TYPE_CUI_FORMAT 23 #define XLOG_REG_TYPE_CUD_FORMAT 24 #define XLOG_REG_TYPE_BUI_FORMAT 25 #define XLOG_REG_TYPE_BUD_FORMAT 26 #define XLOG_REG_TYPE_MAX 26 /* * Flags to log operation header * * The first write of a new transaction will be preceded with a start * record, XLOG_START_TRANS. Once a transaction is committed, a commit * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into * the remainder of the current active in-core log, it is split up into * multiple regions. Each partial region will be marked with a * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. * */ #define XLOG_START_TRANS 0x01 /* Start a new transaction */ #define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ #define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ #define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ typedef struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ } xlog_op_header_t; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 #define XLOG_FMT_LINUX_LE 1 #define XLOG_FMT_LINUX_BE 2 #define XLOG_FMT_IRIX_BE 3 /* our fmt */ #ifdef XFS_NATIVE_HOST #define XLOG_FMT XLOG_FMT_LINUX_BE #else #define XLOG_FMT XLOG_FMT_LINUX_LE #endif typedef struct xlog_rec_header { __be32 h_magicno; /* log record (LR) identifier : 4 */ __be32 h_cycle; /* write cycle of log : 4 */ __be32 h_version; /* LR version : 4 */ __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ __be64 h_lsn; /* lsn of this LR : 8 */ __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* new fields */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ __be32 h_size; /* iclog size : 4 */ } xlog_rec_header_t; typedef struct xlog_rec_ext_header { __be32 xh_cycle; /* write cycle of log : 4 */ __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ } xlog_rec_ext_header_t; /* * Quite misnamed, because this union lays out the actual on-disk log buffer. */ typedef union xlog_in_core2 { xlog_rec_header_t hic_header; xlog_rec_ext_header_t hic_xheader; char hic_sector[XLOG_HEADER_SIZE]; } xlog_in_core_2_t; /* not an on-disk structure, but needed by log recovery in userspace */ typedef struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ } xfs_log_iovec_t; /* * Transaction Header definitions. * * This is the structure written in the log at the head of every transaction. It * identifies the type and id of the transaction, and contains the number of * items logged by the transaction so we know how many to expect during * recovery. * * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). */ typedef struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ } xfs_trans_header_t; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ /* * The only type valid for th_type in CIL-enabled file system logs: */ #define XFS_TRANS_CHECKPOINT 40 /* * Log item types. */ #define XFS_LI_EFI 0x1236 #define XFS_LI_EFD 0x1237 #define XFS_LI_IUNLINK 0x1238 #define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ #define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e #define XFS_LI_ICREATE 0x123f #define XFS_LI_RUI 0x1240 /* rmap update intent */ #define XFS_LI_RUD 0x1241 #define XFS_LI_CUI 0x1242 /* refcount update intent */ #define XFS_LI_CUD 0x1243 #define XFS_LI_BUI 0x1244 /* bmbt update intent */ #define XFS_LI_BUD 0x1245 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ { XFS_LI_EFD, "XFS_LI_EFD" }, \ { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ { XFS_LI_INODE, "XFS_LI_INODE" }, \ { XFS_LI_BUF, "XFS_LI_BUF" }, \ { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ { XFS_LI_RUI, "XFS_LI_RUI" }, \ { XFS_LI_RUD, "XFS_LI_RUD" }, \ { XFS_LI_CUI, "XFS_LI_CUI" }, \ { XFS_LI_CUD, "XFS_LI_CUD" }, \ { XFS_LI_BUI, "XFS_LI_BUI" }, \ { XFS_LI_BUD, "XFS_LI_BUD" } /* * Inode Log Item Format definitions. * * This is the structure used to lay out an inode log item in the * log. The size of the inline data/extents/b-tree root to be logged * (if any) is indicated in the ilf_dsize field. Changes to this structure * must be added on to the end. */ struct xfs_inode_log_format { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ uint32_t ilf_pad; /* pad for 64 bit boundary */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ }; /* * Old 32 bit systems will log in this format without the 64 bit * alignment padding. Recovery will detect this and convert it to the * correct format. */ struct xfs_inode_log_format_32 { uint16_t ilf_type; /* inode log item type */ uint16_t ilf_size; /* size of this item */ uint32_t ilf_fields; /* flags for fields logged */ uint16_t ilf_asize; /* size of attr d/ext/root */ uint16_t ilf_dsize; /* size of data/ext/root */ uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)); /* * Flags for xfs_trans_log_inode flags field. */ #define XFS_ILOG_CORE 0x001 /* log standard inode fields */ #define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ #define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ #define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ #define XFS_ILOG_DEV 0x010 /* log the dev field */ #define XFS_ILOG_UUID 0x020 /* added long ago, but never used */ #define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ /* * The timestamps are dirty, but not necessarily anything else in the inode * core. Unlike the other fields above this one must never make it to disk * in the ilf_fields of the inode_log_format, but is purely store in-memory in * ili_fields in the inode_log_item. */ #define XFS_ILOG_TIMESTAMP 0x4000 #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ XFS_ILOG_AOWNER) #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) #define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ XFS_ILOG_ABROOT) #define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ XFS_ILOG_DEV | XFS_ILOG_ADATA | \ XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ XFS_ILOG_TIMESTAMP | XFS_ILOG_DOWNER | \ XFS_ILOG_AOWNER) static inline int xfs_ilog_fbroot(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); } static inline int xfs_ilog_fext(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } static inline int xfs_ilog_fdata(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); } /* * Incore version of the on-disk inode core structures. We log this directly * into the journal in host CPU format (for better or worse) and as such * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ typedef struct xfs_ictimestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ } xfs_ictimestamp_t; /* * Define the format of the inode core that is logged. This structure must be * kept identical to struct xfs_dinode except for the endianness annotations. */ struct xfs_log_dinode { uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ uint16_t di_mode; /* mode and type of file */ int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ uint8_t di_pad3[2]; /* unused in v2/3 inodes */ uint32_t di_uid; /* owner's user id */ uint32_t di_gid; /* owner's group id */ uint32_t di_nlink; /* number of links to file */ uint16_t di_projid_lo; /* lower part of owner's project id */ uint16_t di_projid_hi; /* higher part of owner's project id */ uint8_t di_pad[6]; /* unused, zeroed space */ uint16_t di_flushiter; /* incremented on flush */ xfs_ictimestamp_t di_atime; /* time last accessed */ xfs_ictimestamp_t di_mtime; /* time last modified */ xfs_ictimestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ xfs_extnum_t di_nextents; /* number of extents in data fork */ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ uint16_t di_dmstate; /* DMIG state info */ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ uint32_t di_gen; /* generation number */ /* di_next_unlinked is the only non-core field in the old dinode */ xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ /* start of the extended dinode, writable fields */ uint32_t di_crc; /* CRC of the inode */ uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ xfs_ino_t di_ino; /* inode number */ uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ }; static inline uint xfs_log_dinode_size(int version) { if (version == 3) return sizeof(struct xfs_log_dinode); return offsetof(struct xfs_log_dinode, di_next_unlinked); } /* * Buffer Log Format defintions * * These are the physical dirty bitmap defintions for the log format structure. */ #define XFS_BLF_CHUNK 128 #define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) /* * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ #define XFS_BLF_INODE_BUF (1<<0) /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ #define XFS_BLF_CANCEL (1<<1) /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ #define XFS_BLF_UDQUOT_BUF (1<<2) #define XFS_BLF_PDQUOT_BUF (1<<3) #define XFS_BLF_GDQUOT_BUF (1<<4) /* * This is the structure used to lay out a buf log item in the * log. The data map describes which 128 byte chunks of the buffer * have been logged. */ #define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ unsigned short blf_len; /* number of blocks in this buf */ int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ } xfs_buf_log_format_t; /* * All buffers now need to tell recovery where the magic number * is so that it can verify and calculate the CRCs on the buffer correctly * once the changes have been replayed into the buffer. * * The type value is held in the upper 5 bits of the blf_flags field, which is * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. */ #define XFS_BLFT_BITS 5 #define XFS_BLFT_SHIFT 11 #define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) enum xfs_blft { XFS_BLFT_UNKNOWN_BUF = 0, XFS_BLFT_UDQUOT_BUF, XFS_BLFT_PDQUOT_BUF, XFS_BLFT_GDQUOT_BUF, XFS_BLFT_BTREE_BUF, XFS_BLFT_AGF_BUF, XFS_BLFT_AGFL_BUF, XFS_BLFT_AGI_BUF, XFS_BLFT_DINO_BUF, XFS_BLFT_SYMLINK_BUF, XFS_BLFT_DIR_BLOCK_BUF, XFS_BLFT_DIR_DATA_BUF, XFS_BLFT_DIR_FREE_BUF, XFS_BLFT_DIR_LEAF1_BUF, XFS_BLFT_DIR_LEAFN_BUF, XFS_BLFT_DA_NODE_BUF, XFS_BLFT_ATTR_LEAF_BUF, XFS_BLFT_ATTR_RMT_BUF, XFS_BLFT_SB_BUF, XFS_BLFT_RTBITMAP_BUF, XFS_BLFT_RTSUMMARY_BUF, XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), }; static inline void xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) { ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); blf->blf_flags &= ~XFS_BLFT_MASK; blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); } static inline uint16_t xfs_blft_from_flags(struct xfs_buf_log_format *blf) { return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; } /* * EFI/EFD log format definitions */ typedef struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; } xfs_extent_t; /* * Since an xfs_extent_t has types (start:64, len: 32) * there are different alignments on 32 bit and 64 bit kernels. * So we provide the different variants for use by a * conversion routine. */ typedef struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; typedef struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; } xfs_extent_64_t; /* * This is the structure used to lay out an efi log item in the * log. The efi_extents field is a variable size array whose * size is given by efi_nextents. */ typedef struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; typedef struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; typedef struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ xfs_extent_64_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_64_t; /* * This is the structure used to lay out an efd log item in the * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ typedef struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; typedef struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; typedef struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; /* * RUI/RUD (reverse mapping) log format definitions */ struct xfs_map_extent { uint64_t me_owner; uint64_t me_startblock; uint64_t me_startoff; uint32_t me_len; uint32_t me_flags; }; /* rmap me_flags: upper bits are flags, lower byte is type code */ #define XFS_RMAP_EXTENT_MAP 1 #define XFS_RMAP_EXTENT_MAP_SHARED 2 #define XFS_RMAP_EXTENT_UNMAP 3 #define XFS_RMAP_EXTENT_UNMAP_SHARED 4 #define XFS_RMAP_EXTENT_CONVERT 5 #define XFS_RMAP_EXTENT_CONVERT_SHARED 6 #define XFS_RMAP_EXTENT_ALLOC 7 #define XFS_RMAP_EXTENT_FREE 8 #define XFS_RMAP_EXTENT_TYPE_MASK 0xFF #define XFS_RMAP_EXTENT_ATTR_FORK (1U << 31) #define XFS_RMAP_EXTENT_BMBT_BLOCK (1U << 30) #define XFS_RMAP_EXTENT_UNWRITTEN (1U << 29) #define XFS_RMAP_EXTENT_FLAGS (XFS_RMAP_EXTENT_TYPE_MASK | \ XFS_RMAP_EXTENT_ATTR_FORK | \ XFS_RMAP_EXTENT_BMBT_BLOCK | \ XFS_RMAP_EXTENT_UNWRITTEN) /* * This is the structure used to lay out an rui log item in the * log. The rui_extents field is a variable size array whose * size is given by rui_nextents. */ struct xfs_rui_log_format { uint16_t rui_type; /* rui log item type */ uint16_t rui_size; /* size of this item */ uint32_t rui_nextents; /* # extents to free */ uint64_t rui_id; /* rui identifier */ struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; static inline size_t xfs_rui_log_format_sizeof( unsigned int nr) { return sizeof(struct xfs_rui_log_format) + nr * sizeof(struct xfs_map_extent); } /* * This is the structure used to lay out an rud log item in the * log. The rud_extents array is a variable size array whose * size is given by rud_nextents; */ struct xfs_rud_log_format { uint16_t rud_type; /* rud log item type */ uint16_t rud_size; /* size of this item */ uint32_t __pad; uint64_t rud_rui_id; /* id of corresponding rui */ }; /* * CUI/CUD (refcount update) log format definitions */ struct xfs_phys_extent { uint64_t pe_startblock; uint32_t pe_len; uint32_t pe_flags; }; /* refcount pe_flags: upper bits are flags, lower byte is type code */ /* Type codes are taken directly from enum xfs_refcount_intent_type. */ #define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF #define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK) /* * This is the structure used to lay out a cui log item in the * log. The cui_extents field is a variable size array whose * size is given by cui_nextents. */ struct xfs_cui_log_format { uint16_t cui_type; /* cui log item type */ uint16_t cui_size; /* size of this item */ uint32_t cui_nextents; /* # extents to free */ uint64_t cui_id; /* cui identifier */ struct xfs_phys_extent cui_extents[]; /* array of extents */ }; static inline size_t xfs_cui_log_format_sizeof( unsigned int nr) { return sizeof(struct xfs_cui_log_format) + nr * sizeof(struct xfs_phys_extent); } /* * This is the structure used to lay out a cud log item in the * log. The cud_extents array is a variable size array whose * size is given by cud_nextents; */ struct xfs_cud_log_format { uint16_t cud_type; /* cud log item type */ uint16_t cud_size; /* size of this item */ uint32_t __pad; uint64_t cud_cui_id; /* id of corresponding cui */ }; /* * BUI/BUD (inode block mapping) log format definitions */ /* bmbt me_flags: upper bits are flags, lower byte is type code */ /* Type codes are taken directly from enum xfs_bmap_intent_type. */ #define XFS_BMAP_EXTENT_TYPE_MASK 0xFF #define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) #define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) #define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ XFS_BMAP_EXTENT_ATTR_FORK | \ XFS_BMAP_EXTENT_UNWRITTEN) /* * This is the structure used to lay out an bui log item in the * log. The bui_extents field is a variable size array whose * size is given by bui_nextents. */ struct xfs_bui_log_format { uint16_t bui_type; /* bui log item type */ uint16_t bui_size; /* size of this item */ uint32_t bui_nextents; /* # extents to free */ uint64_t bui_id; /* bui identifier */ struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ }; static inline size_t xfs_bui_log_format_sizeof( unsigned int nr) { return sizeof(struct xfs_bui_log_format) + nr * sizeof(struct xfs_map_extent); } /* * This is the structure used to lay out an bud log item in the * log. The bud_extents array is a variable size array whose * size is given by bud_nextents; */ struct xfs_bud_log_format { uint16_t bud_type; /* bud log item type */ uint16_t bud_size; /* size of this item */ uint32_t __pad; uint64_t bud_bui_id; /* id of corresponding bui */ }; /* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. */ typedef struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ } xfs_dq_logformat_t; /* * log format struct for QUOTAOFF records. * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ typedef struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ } xfs_qoff_logformat_t; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. */ #define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ #define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ #define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ #define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ #define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ #define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ /* * Conversion to and from the combined OQUOTA flag (if necessary) * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() */ #define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ #define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ #define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ #define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ #define XFS_ALL_QUOTA_ACCT \ (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) #define XFS_ALL_QUOTA_ENFD \ (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) #define XFS_ALL_QUOTA_CHKD \ (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ XFS_PQUOTA_CHKD) /* * Inode create log item structure * * Log recovery assumes the first two entries are the type and size and they fit * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so * decoding can be done correctly. */ struct xfs_icreate_log { uint16_t icl_type; /* type of log format structure */ uint16_t icl_size; /* size of log format structure */ __be32 icl_ag; /* ag being allocated in */ __be32 icl_agbno; /* start block of inode range */ __be32 icl_count; /* number of inodes to initialise */ __be32 icl_isize; /* size of inodes */ __be32 icl_length; /* length of extent to initialise */ __be32 icl_gen; /* inode generation number to use */ }; #endif /* __XFS_LOG_FORMAT_H__ */ |
2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | /* * IPV4 GSO/GRO offload support * Linux INET implementation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * GRE GSO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <net/gre.h> static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); bool need_csum, need_recompute_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; if (!skb->encapsulation) goto out; if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; /* setup inner skb. */ skb->encapsulation = 0; SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; } gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; do { struct gre_base_hdr *greh; __sum16 *pcsum; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } skb->mac_len = mac_len; skb->protocol = protocol; __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, gre_offset); if (!need_csum) continue; greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that * the partial checksum is based on actual size * whereas headers should be based on MSS size. */ partial_adj = skb->len + skb_headroom(skb) - SKB_GSO_CB(skb)->data_offset - skb_shinfo(skb)->gso_size; *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); } else { *pcsum = 0; } *(pcsum + 1) = 0; if (need_recompute_csum && !skb_is_gso(skb)) { __wsum csum; csum = skb_checksum(skb, gre_offset, skb->len - gre_offset, 0); *pcsum = csum_fold(csum); } else { *pcsum = gso_make_checksum(skb, 0); } } while ((skb = skb->next)); out: return segs; } static struct sk_buff *gre_gro_receive(struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; unsigned int off; int flush = 1; struct packet_offload *ptype; __be16 type; if (NAPI_GRO_CB(skb)->encap_mark) goto out; NAPI_GRO_CB(skb)->encap_mark = 1; off = skb_gro_offset(skb); hlen = off + sizeof(*greh); greh = skb_gro_header_fast(skb, off); if (skb_gro_header_hard(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out; } /* Only support version 0 and K (key), C (csum) flags. Note that * although the support for the S (seq#) flag can be added easily * for GRO, this is problematic for GSO hence can not be enabled * here because a GRO pkt may end up in the forwarding path, thus * requiring GSO support to break it up correctly. */ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; /* We can only support GRE_CSUM if we can track the location of * the GRE header. In the case of FOU/GUE we cannot because the * outer UDP header displaces the GRE header leaving us in a state * of limbo. */ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) goto out; type = greh->protocol; rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (!ptype) goto out_unlock; grehlen = GRE_HEADER_SECTION; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; hlen = off + grehlen; if (skb_gro_header_hard(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out_unlock; } /* Don't bother verifying checksum if we're going to flush anyway. */ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, null_compute_pseudo); } list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) continue; /* The following checks are needed to ensure only pkts * from the same tunnel are considered for aggregation. * The criteria for "the same tunnel" includes: * 1) same version (we only support version 0 here) * 2) same protocol (we only support ETH_P_IP for now) * 3) same set of flags * 4) same key if the key field is present. */ greh2 = (struct gre_base_hdr *)(p->data + off); if (greh2->flags != greh->flags || greh2->protocol != greh->protocol) { NAPI_GRO_CB(p)->same_flow = 0; continue; } if (greh->flags & GRE_KEY) { /* compare keys */ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } } skb_gro_pull(skb, grehlen); /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: rcu_read_unlock(); out: skb_gro_flush_final(skb, pp, flush); return pp; } static int gre_gro_complete(struct sk_buff *skb, int nhoff) { struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); struct packet_offload *ptype; unsigned int grehlen = sizeof(*greh); int err = -ENOENT; __be16 type; skb->encapsulation = 1; skb_shinfo(skb)->gso_type = SKB_GSO_GRE; type = greh->protocol; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); skb_set_inner_mac_header(skb, nhoff + grehlen); return err; } static const struct net_offload gre_offload = { .callbacks = { .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, }, }; static int __init gre_offload_init(void) { int err; err = inet_add_offload(&gre_offload, IPPROTO_GRE); #if IS_ENABLED(CONFIG_IPV6) if (err) return err; err = inet6_add_offload(&gre_offload, IPPROTO_GRE); if (err) inet_del_offload(&gre_offload, IPPROTO_GRE); #endif return err; } device_initcall(gre_offload_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | /* * Glue Code for x86_64/AVX2 assembler optimized version of Serpent * * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> #include <asm/crypto/serpent-avx.h> #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src); asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src, le128 *iv); asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static const struct common_glue_ctx serpent_enc = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) } }, { .num_blocks = 8, .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } } } }; static const struct common_glue_ctx serpent_ctr = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) } }, { .num_blocks = 8, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) } } } }; static const struct common_glue_ctx serpent_enc_xts = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) } }, { .num_blocks = 8, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) } }, { .num_blocks = 1, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) } } } }; static const struct common_glue_ctx serpent_dec = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) } }, { .num_blocks = 8, .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } } } }; static const struct common_glue_ctx serpent_dec_cbc = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) } }, { .num_blocks = 8, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } }, { .num_blocks = 1, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } } } }; static const struct common_glue_ctx serpent_dec_xts = { .num_funcs = 3, .fpu_blocks_limit = 8, .funcs = { { .num_blocks = 16, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) } }, { .num_blocks = 8, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) } }, { .num_blocks = 1, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) } } } }; static int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&serpent_enc, req); } static int ecb_decrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&serpent_dec, req); } static int cbc_encrypt(struct skcipher_request *req) { return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt), req); } static int cbc_decrypt(struct skcipher_request *req) { return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req); } static int ctr_crypt(struct skcipher_request *req) { return glue_ctr_req_128bit(&serpent_ctr, req); } static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_enc_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&serpent_dec_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", .base.cra_driver_name = "__ecb-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(serpent)", .base.cra_driver_name = "__cbc-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { .base.cra_name = "__ctr(serpent)", .base.cra_driver_name = "__ctr-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .chunksize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ctr_crypt, .decrypt = ctr_crypt, }, { .base.cra_name = "__xts(serpent)", .base.cra_driver_name = "__xts-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_xts_ctx), .base.cra_module = THIS_MODULE, .min_keysize = 2 * SERPENT_MIN_KEY_SIZE, .max_keysize = 2 * SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = xts_serpent_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, }, }; static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } static void __exit fini(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); MODULE_ALIAS_CRYPTO("serpent"); MODULE_ALIAS_CRYPTO("serpent-asm"); |
5 4 36 36 36 41 41 7 3 2 38 38 32 38 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 | /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/arp.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/spinlock.h> #include <net/netrom.h> #include <linux/seq_file.h> #include <linux/export.h> static unsigned int nr_neigh_no = 1; static HLIST_HEAD(nr_node_list); static DEFINE_SPINLOCK(nr_node_list_lock); static HLIST_HEAD(nr_neigh_list); static DEFINE_SPINLOCK(nr_neigh_list_lock); static struct nr_node *nr_node_get(ax25_address *callsign) { struct nr_node *found = NULL; struct nr_node *nr_node; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_node, &nr_node_list) if (ax25cmp(callsign, &nr_node->callsign) == 0) { nr_node_hold(nr_node); found = nr_node; break; } spin_unlock_bh(&nr_node_list_lock); return found; } static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, struct net_device *dev) { struct nr_neigh *found = NULL; struct nr_neigh *nr_neigh; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each(nr_neigh, &nr_neigh_list) if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && nr_neigh->dev == dev) { nr_neigh_hold(nr_neigh); found = nr_neigh; break; } spin_unlock_bh(&nr_neigh_list_lock); return found; } static void nr_remove_neigh(struct nr_neigh *); /* re-sort the routes in quality order. */ static void re_sort_routes(struct nr_node *nr_node, int x, int y) { if (nr_node->routes[y].quality > nr_node->routes[x].quality) { if (nr_node->which == x) nr_node->which = y; else if (nr_node->which == y) nr_node->which = x; swap(nr_node->routes[x], nr_node->routes[y]); } } /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. */ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev, int quality, int obs_count) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i, found; struct net_device *odev; if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */ dev_put(odev); return -EINVAL; } nr_node = nr_node_get(nr); nr_neigh = nr_neigh_get_dev(ax25, dev); /* * The L2 link to a neighbour has failed in the past * and now a frame comes from this neighbour. We assume * it was a temporary trouble with the link and reset the * routes now (and not wait for a node broadcast). */ if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { struct nr_node *nr_nodet; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_nodet, &nr_node_list) { nr_node_lock(nr_nodet); for (i = 0; i < nr_nodet->count; i++) if (nr_nodet->routes[i].neighbour == nr_neigh) if (i < nr_nodet->which) nr_nodet->which = i; nr_node_unlock(nr_nodet); } spin_unlock_bh(&nr_node_list_lock); } if (nr_neigh != NULL) nr_neigh->failed = 0; if (quality == 0 && nr_neigh != NULL && nr_node != NULL) { nr_neigh_put(nr_neigh); nr_node_put(nr_node); return 0; } if (nr_neigh == NULL) { if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) { if (nr_node) nr_node_put(nr_node); return -ENOMEM; } nr_neigh->callsign = *ax25; nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; nr_neigh->quality = sysctl_netrom_default_path_quality; nr_neigh->locked = 0; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; nr_neigh->failed = 0; refcount_set(&nr_neigh->refcount, 1); if (ax25_digi != NULL && ax25_digi->ndigi > 0) { nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), GFP_KERNEL); if (nr_neigh->digipeat == NULL) { kfree(nr_neigh); if (nr_node) nr_node_put(nr_node); return -ENOMEM; } } spin_lock_bh(&nr_neigh_list_lock); hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); nr_neigh_hold(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked) nr_neigh->quality = quality; if (nr_node == NULL) { if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) { if (nr_neigh) nr_neigh_put(nr_neigh); return -ENOMEM; } nr_node->callsign = *nr; strcpy(nr_node->mnemonic, mnemonic); nr_node->which = 0; nr_node->count = 1; refcount_set(&nr_node->refcount, 1); spin_lock_init(&nr_node->node_lock); nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; nr_node->routes[0].neighbour = nr_neigh; nr_neigh_hold(nr_neigh); nr_neigh->count++; spin_lock_bh(&nr_node_list_lock); hlist_add_head(&nr_node->node_node, &nr_node_list); /* refcount initialized at 1 */ spin_unlock_bh(&nr_node_list_lock); nr_neigh_put(nr_neigh); return 0; } nr_node_lock(nr_node); if (quality != 0) strcpy(nr_node->mnemonic, mnemonic); for (found = 0, i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { nr_node->routes[i].quality = quality; nr_node->routes[i].obs_count = obs_count; found = 1; break; } } if (!found) { /* We have space at the bottom, slot it in */ if (nr_node->count < 3) { nr_node->routes[2] = nr_node->routes[1]; nr_node->routes[1] = nr_node->routes[0]; nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; nr_node->routes[0].neighbour = nr_neigh; nr_node->which++; nr_node->count++; nr_neigh_hold(nr_neigh); nr_neigh->count++; } else { /* It must be better than the worst */ if (quality > nr_node->routes[2].quality) { nr_node->routes[2].neighbour->count--; nr_neigh_put(nr_node->routes[2].neighbour); if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked) nr_remove_neigh(nr_node->routes[2].neighbour); nr_node->routes[2].quality = quality; nr_node->routes[2].obs_count = obs_count; nr_node->routes[2].neighbour = nr_neigh; nr_neigh_hold(nr_neigh); nr_neigh->count++; } } } /* Now re-sort the routes in quality order */ switch (nr_node->count) { case 3: re_sort_routes(nr_node, 0, 1); re_sort_routes(nr_node, 1, 2); /* fall through */ case 2: re_sort_routes(nr_node, 0, 1); case 1: break; } for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { if (i < nr_node->which) nr_node->which = i; break; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } static inline void __nr_remove_node(struct nr_node *nr_node) { hlist_del_init(&nr_node->node_node); nr_node_put(nr_node); } #define nr_remove_node_locked(__node) \ __nr_remove_node(__node) static void nr_remove_node(struct nr_node *nr_node) { spin_lock_bh(&nr_node_list_lock); __nr_remove_node(nr_node); spin_unlock_bh(&nr_node_list_lock); } static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) { hlist_del_init(&nr_neigh->neigh_node); nr_neigh_put(nr_neigh); } #define nr_remove_neigh_locked(__neigh) \ __nr_remove_neigh(__neigh) static void nr_remove_neigh(struct nr_neigh *nr_neigh) { spin_lock_bh(&nr_neigh_list_lock); __nr_remove_neigh(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } /* * "Delete" a node. Strictly speaking remove a route to a node. The node * is only deleted if no routes are left to it. */ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i; nr_node = nr_node_get(callsign); if (nr_node == NULL) return -EINVAL; nr_neigh = nr_neigh_get_dev(neighbour, dev); if (nr_neigh == NULL) { nr_node_put(nr_node); return -EINVAL; } nr_node_lock(nr_node); for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { nr_neigh->count--; nr_neigh_put(nr_neigh); if (nr_neigh->count == 0 && !nr_neigh->locked) nr_remove_neigh(nr_neigh); nr_neigh_put(nr_neigh); nr_node->count--; if (nr_node->count == 0) { nr_remove_node(nr_node); } else { switch (i) { case 0: nr_node->routes[0] = nr_node->routes[1]; /* fall through */ case 1: nr_node->routes[1] = nr_node->routes[2]; case 2: break; } nr_node_put(nr_node); } nr_node_unlock(nr_node); return 0; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); nr_node_put(nr_node); return -EINVAL; } /* * Lock a neighbour with a quality. */ static int __must_check nr_add_neigh(ax25_address *callsign, ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality) { struct nr_neigh *nr_neigh; nr_neigh = nr_neigh_get_dev(callsign, dev); if (nr_neigh) { nr_neigh->quality = quality; nr_neigh->locked = 1; nr_neigh_put(nr_neigh); return 0; } if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) return -ENOMEM; nr_neigh->callsign = *callsign; nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; nr_neigh->quality = quality; nr_neigh->locked = 1; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; nr_neigh->failed = 0; refcount_set(&nr_neigh->refcount, 1); if (ax25_digi != NULL && ax25_digi->ndigi > 0) { nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), GFP_KERNEL); if (nr_neigh->digipeat == NULL) { kfree(nr_neigh); return -ENOMEM; } } spin_lock_bh(&nr_neigh_list_lock); hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); /* refcount is initialized at 1 */ spin_unlock_bh(&nr_neigh_list_lock); return 0; } /* * "Delete" a neighbour. The neighbour is only removed if the number * of nodes that may use it is zero. */ static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality) { struct nr_neigh *nr_neigh; nr_neigh = nr_neigh_get_dev(callsign, dev); if (nr_neigh == NULL) return -EINVAL; nr_neigh->quality = quality; nr_neigh->locked = 0; if (nr_neigh->count == 0) nr_remove_neigh(nr_neigh); nr_neigh_put(nr_neigh); return 0; } /* * Decrement the obsolescence count by one. If a route is reduced to a * count of zero, remove it. Also remove any unlocked neighbours with * zero nodes routing via it. */ static int nr_dec_obs(void) { struct nr_neigh *nr_neigh; struct nr_node *s; struct hlist_node *nodet; int i; spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(s, nodet, &nr_node_list) { nr_node_lock(s); for (i = 0; i < s->count; i++) { switch (s->routes[i].obs_count) { case 0: /* A locked entry */ break; case 1: /* From 1 -> 0 */ nr_neigh = s->routes[i].neighbour; nr_neigh->count--; nr_neigh_put(nr_neigh); if (nr_neigh->count == 0 && !nr_neigh->locked) nr_remove_neigh(nr_neigh); s->count--; switch (i) { case 0: s->routes[0] = s->routes[1]; /* Fallthrough */ case 1: s->routes[1] = s->routes[2]; case 2: break; } break; default: s->routes[i].obs_count--; break; } } if (s->count <= 0) nr_remove_node_locked(s); nr_node_unlock(s); } spin_unlock_bh(&nr_node_list_lock); return 0; } /* * A device has been removed. Remove its routes and neighbours. */ void nr_rt_device_down(struct net_device *dev) { struct nr_neigh *s; struct hlist_node *nodet, *node2t; struct nr_node *t; int i; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { if (s->dev == dev) { spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(t, node2t, &nr_node_list) { nr_node_lock(t); for (i = 0; i < t->count; i++) { if (t->routes[i].neighbour == s) { t->count--; switch (i) { case 0: t->routes[0] = t->routes[1]; /* fall through */ case 1: t->routes[1] = t->routes[2]; case 2: break; } } } if (t->count <= 0) nr_remove_node_locked(t); nr_node_unlock(t); } spin_unlock_bh(&nr_node_list_lock); nr_remove_neigh_locked(s); } } spin_unlock_bh(&nr_neigh_list_lock); } /* * Check that the device given is a valid AX.25 interface that is "up". * Or a valid ethernet interface with an AX.25 callsign binding. */ static struct net_device *nr_ax25_dev_get(char *devname) { struct net_device *dev; if ((dev = dev_get_by_name(&init_net, devname)) == NULL) return NULL; if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; dev_put(dev); return NULL; } /* * Find the first active NET/ROM device, usually "nr0". */ struct net_device *nr_dev_first(void) { struct net_device *dev, *first = NULL; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } if (first) dev_hold(first); rcu_read_unlock(); return first; } /* * Find the NET/ROM device for the given callsign. */ struct net_device *nr_dev_get(ax25_address *addr) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } dev = NULL; out: rcu_read_unlock(); return dev; } static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis, ax25_address *digipeaters) { int i; if (ndigis == 0) return NULL; for (i = 0; i < ndigis; i++) { digi->calls[i] = digipeaters[i]; digi->repeated[i] = 0; } digi->ndigi = ndigis; digi->lastrepeat = -1; return digi; } /* * Handle the ioctls that control the routing functions. */ int nr_rt_ioctl(unsigned int cmd, void __user *arg) { struct nr_route_struct nr_route; struct net_device *dev; ax25_digi digi; int ret; switch (cmd) { case SIOCADDRT: if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) return -EFAULT; if (nr_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; switch (nr_route.type) { case NETROM_NODE: if (strnlen(nr_route.mnemonic, 7) == 7) { ret = -EINVAL; break; } ret = nr_add_node(&nr_route.callsign, nr_route.mnemonic, &nr_route.neighbour, nr_call_to_digi(&digi, nr_route.ndigis, nr_route.digipeaters), dev, nr_route.quality, nr_route.obs_count); break; case NETROM_NEIGH: ret = nr_add_neigh(&nr_route.callsign, nr_call_to_digi(&digi, nr_route.ndigis, nr_route.digipeaters), dev, nr_route.quality); break; default: ret = -EINVAL; } dev_put(dev); return ret; case SIOCDELRT: if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) return -EFAULT; if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; switch (nr_route.type) { case NETROM_NODE: ret = nr_del_node(&nr_route.callsign, &nr_route.neighbour, dev); break; case NETROM_NEIGH: ret = nr_del_neigh(&nr_route.callsign, dev, nr_route.quality); break; default: ret = -EINVAL; } dev_put(dev); return ret; case SIOCNRDECOBS: return nr_dec_obs(); default: return -EINVAL; } return 0; } /* * A level 2 link has timed out, therefore it appears to be a poor link, * then don't use that neighbour until it is reset. */ void nr_link_failed(ax25_cb *ax25, int reason) { struct nr_neigh *s, *nr_neigh = NULL; struct nr_node *nr_node = NULL; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each(s, &nr_neigh_list) { if (s->ax25 == ax25) { nr_neigh_hold(s); nr_neigh = s; break; } } spin_unlock_bh(&nr_neigh_list_lock); if (nr_neigh == NULL) return; nr_neigh->ax25 = NULL; ax25_cb_put(ax25); if (++nr_neigh->failed < sysctl_netrom_link_fails_count) { nr_neigh_put(nr_neigh); return; } spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_node, &nr_node_list) { nr_node_lock(nr_node); if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) nr_node->which++; nr_node_unlock(nr_node); } spin_unlock_bh(&nr_node_list_lock); nr_neigh_put(nr_neigh); } /* * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb * indicates an internally generated frame. */ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) { ax25_address *nr_src, *nr_dest; struct nr_neigh *nr_neigh; struct nr_node *nr_node; struct net_device *dev; unsigned char *dptr; ax25_cb *ax25s; int ret; struct sk_buff *skbn; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); if (ax25 != NULL) { ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, ax25->ax25_dev->dev, 0, sysctl_netrom_obsolescence_count_initialiser); if (ret) return ret; } if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */ if (ax25 == NULL) /* Its from me */ ret = nr_loopback_queue(skb); else ret = nr_rx_frame(skb, dev); dev_put(dev); return ret; } if (!sysctl_netrom_routing_control && ax25 != NULL) return 0; /* Its Time-To-Live has expired */ if (skb->data[14] == 1) { return 0; } nr_node = nr_node_get(nr_dest); if (nr_node == NULL) return 0; nr_node_lock(nr_node); if (nr_node->which >= nr_node->count) { nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } nr_neigh = nr_node->routes[nr_node->which].neighbour; if ((dev = nr_dev_first()) == NULL) { nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } /* We are going to change the netrom headers so we should get our own skb, we also did not know until now how much header space we had to reserve... - RXQ */ if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) { nr_node_unlock(nr_node); nr_node_put(nr_node); dev_put(dev); return 0; } kfree_skb(skb); skb=skbn; skb->data[14]--; dptr = skb_push(skb, 1); *dptr = AX25_P_NETROM; ax25s = nr_neigh->ax25; nr_neigh->ax25 = ax25_send_frame(skb, 256, (ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); if (ax25s) ax25_cb_put(ax25s); dev_put(dev); ret = (nr_neigh->ax25 != NULL); nr_node_unlock(nr_node); nr_node_put(nr_node); return ret; } #ifdef CONFIG_PROC_FS static void *nr_node_start(struct seq_file *seq, loff_t *pos) { spin_lock_bh(&nr_node_list_lock); return seq_hlist_start_head(&nr_node_list, *pos); } static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_node_list, pos); } static void nr_node_stop(struct seq_file *seq, void *v) { spin_unlock_bh(&nr_node_list_lock); } static int nr_node_show(struct seq_file *seq, void *v) { char buf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n"); else { struct nr_node *nr_node = hlist_entry(v, struct nr_node, node_node); nr_node_lock(nr_node); seq_printf(seq, "%-9s %-7s %d %d", ax2asc(buf, &nr_node->callsign), (nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic, nr_node->which + 1, nr_node->count); for (i = 0; i < nr_node->count; i++) { seq_printf(seq, " %3d %d %05d", nr_node->routes[i].quality, nr_node->routes[i].obs_count, nr_node->routes[i].neighbour->number); } nr_node_unlock(nr_node); seq_puts(seq, "\n"); } return 0; } const struct seq_operations nr_node_seqops = { .start = nr_node_start, .next = nr_node_next, .stop = nr_node_stop, .show = nr_node_show, }; static void *nr_neigh_start(struct seq_file *seq, loff_t *pos) { spin_lock_bh(&nr_neigh_list_lock); return seq_hlist_start_head(&nr_neigh_list, *pos); } static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_neigh_list, pos); } static void nr_neigh_stop(struct seq_file *seq, void *v) { spin_unlock_bh(&nr_neigh_list_lock); } static int nr_neigh_show(struct seq_file *seq, void *v) { char buf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n"); else { struct nr_neigh *nr_neigh; nr_neigh = hlist_entry(v, struct nr_neigh, neigh_node); seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d", nr_neigh->number, ax2asc(buf, &nr_neigh->callsign), nr_neigh->dev ? nr_neigh->dev->name : "???", nr_neigh->quality, nr_neigh->locked, nr_neigh->count, nr_neigh->failed); if (nr_neigh->digipeat != NULL) { for (i = 0; i < nr_neigh->digipeat->ndigi; i++) seq_printf(seq, " %s", ax2asc(buf, &nr_neigh->digipeat->calls[i])); } seq_puts(seq, "\n"); } return 0; } const struct seq_operations nr_neigh_seqops = { .start = nr_neigh_start, .next = nr_neigh_next, .stop = nr_neigh_stop, .show = nr_neigh_show, }; #endif /* * Free all memory associated with the nodes and routes lists. */ void nr_rt_free(void) { struct nr_neigh *s = NULL; struct nr_node *t = NULL; struct hlist_node *nodet; spin_lock_bh(&nr_neigh_list_lock); spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(t, nodet, &nr_node_list) { nr_node_lock(t); nr_remove_node_locked(t); nr_node_unlock(t); } nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { while(s->count) { s->count--; nr_neigh_put(s); } nr_remove_neigh_locked(s); } spin_unlock_bh(&nr_node_list_lock); spin_unlock_bh(&nr_neigh_list_lock); } |
1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init(&a->ring, size, gfp); } static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */ |
40 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | #ifndef DECOMPRESSOR_H #define DECOMPRESSOR_H /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2, * or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * decompressor.h */ struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void *, struct buffer_head **, int, int, int, struct squashfs_page_actor *); int id; char *name; int supported; }; static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, void *buff, int length) { return msblk->decompressor->comp_opts ? msblk->decompressor->comp_opts(msblk, buff, length) : NULL; } #ifdef CONFIG_SQUASHFS_XZ extern const struct squashfs_decompressor squashfs_xz_comp_ops; #endif #ifdef CONFIG_SQUASHFS_LZ4 extern const struct squashfs_decompressor squashfs_lz4_comp_ops; #endif #ifdef CONFIG_SQUASHFS_LZO extern const struct squashfs_decompressor squashfs_lzo_comp_ops; #endif #ifdef CONFIG_SQUASHFS_ZLIB extern const struct squashfs_decompressor squashfs_zlib_comp_ops; #endif #ifdef CONFIG_SQUASHFS_ZSTD extern const struct squashfs_decompressor squashfs_zstd_comp_ops; #endif #endif |
67 67 67 67 67 67 67 67 66 66 66 65 66 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Jie Liu. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_trans_space.h" #include "xfs_inode.h" #include "xfs_da_btree.h" #include "xfs_attr_leaf.h" #include "xfs_bmap_btree.h" /* * Calculate the maximum length in bytes that would be required for a local * attribute value as large attributes out of line are not logged. */ STATIC int xfs_log_calc_max_attrsetm_res( struct xfs_mount *mp) { int size; int nblks; size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; } /* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. */ void xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); resp = (struct xfs_trans_res *)M_RES(mp); end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); for (; resp < end_resp; resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ } } if (attr_space > log_space) { *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } } /* * Calculate the minimum valid log size for the given superblock configuration. * Used to calculate the minimum log size at mkfs time, and to determine if * the log is large enough or not at mount time. Returns the minimum size in * filesystem block size units. */ int xfs_log_calc_minimum_size( struct xfs_mount *mp) { struct xfs_trans_res tres = {0}; int max_logres; int min_logblks = 0; int lsunit = 0; xfs_log_get_max_trans_res(mp, &tres); max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); if (tres.tr_logcount > 1) max_logres *= tres.tr_logcount; if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) lsunit = BTOBB(mp->m_sb.sb_logsunit); /* * Two factors should be taken into account for calculating the minimum * log space. * 1) The fundamental limitation is that no single transaction can be * larger than half size of the log. * * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR * define, which is set to 3. That means we can definitely fit * maximally sized 2 transactions in the log. We'll use this same * value here. * * 2) If the lsunit option is specified, a transaction requires 2 LSU * for the reservation because there are two log writes that can * require padding - the transaction data and the commit record which * are written separately and both can require padding to the LSU. * Consider that we can have an active CIL reservation holding 2*LSU, * but the CIL is not over a push threshold, in this case, if we * don't have enough log space for at one new transaction, which * includes another 2*LSU in the reservation, we will run into dead * loop situation in log space grant procedure. i.e. * xlog_grant_head_wait(). * * Hence the log size needs to be able to contain two maximally sized * and padded transactions, which is (2 * (2 * LSU + maxlres)). * * Also, the log size should be a multiple of the log stripe unit, round * it up to lsunit boundary if lsunit is specified. */ if (lsunit) { min_logblks = roundup_64(BTOBB(max_logres), lsunit) + 2 * lsunit; } else min_logblks = BTOBB(max_logres) + 2 * BBSIZE; min_logblks *= XFS_MIN_LOG_FACTOR; return XFS_BB_TO_FSB(mp, min_logblks); } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-cec.c - A Virtual Video Test Driver, cec emulation * * Copyright 2016 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <media/cec.h> #include "vivid-core.h" #include "vivid-cec.h" #define CEC_TIM_START_BIT_TOTAL 4500 #define CEC_TIM_START_BIT_LOW 3700 #define CEC_TIM_START_BIT_HIGH 800 #define CEC_TIM_DATA_BIT_TOTAL 2400 #define CEC_TIM_DATA_BIT_0_LOW 1500 #define CEC_TIM_DATA_BIT_0_HIGH 900 #define CEC_TIM_DATA_BIT_1_LOW 600 #define CEC_TIM_DATA_BIT_1_HIGH 1800 void vivid_cec_bus_free_work(struct vivid_dev *dev) { spin_lock(&dev->cec_slock); while (!list_empty(&dev->cec_work_list)) { struct vivid_cec_work *cw = list_first_entry(&dev->cec_work_list, struct vivid_cec_work, list); spin_unlock(&dev->cec_slock); cancel_delayed_work_sync(&cw->work); spin_lock(&dev->cec_slock); list_del(&cw->list); cec_transmit_attempt_done(cw->adap, CEC_TX_STATUS_LOW_DRIVE); kfree(cw); } spin_unlock(&dev->cec_slock); } static bool vivid_cec_find_dest_adap(struct vivid_dev *dev, struct cec_adapter *adap, u8 dest) { unsigned int i; if (dest >= 0xf) return false; if (adap != dev->cec_rx_adap && dev->cec_rx_adap && dev->cec_rx_adap->is_configured && cec_has_log_addr(dev->cec_rx_adap, dest)) return true; for (i = 0; i < MAX_OUTPUTS && dev->cec_tx_adap[i]; i++) { if (adap == dev->cec_tx_adap[i]) continue; if (!dev->cec_tx_adap[i]->is_configured) continue; if (cec_has_log_addr(dev->cec_tx_adap[i], dest)) return true; } return false; } static void vivid_cec_pin_adap_events(struct cec_adapter *adap, ktime_t ts, const struct cec_msg *msg, bool nacked) { unsigned int len = nacked ? 1 : msg->len; unsigned int i; bool bit; if (adap == NULL) return; /* * Suffix ULL on constant 10 makes the expression * CEC_TIM_START_BIT_TOTAL + 10ULL * len * CEC_TIM_DATA_BIT_TOTAL * to be evaluated using 64-bit unsigned arithmetic (u64), which * is what ktime_sub_us expects as second argument. */ ts = ktime_sub_us(ts, CEC_TIM_START_BIT_TOTAL + 10ULL * len * CEC_TIM_DATA_BIT_TOTAL); cec_queue_pin_cec_event(adap, false, false, ts); ts = ktime_add_us(ts, CEC_TIM_START_BIT_LOW); cec_queue_pin_cec_event(adap, true, false, ts); ts = ktime_add_us(ts, CEC_TIM_START_BIT_HIGH); for (i = 0; i < 10 * len; i++) { switch (i % 10) { case 0 ... 7: bit = msg->msg[i / 10] & (0x80 >> (i % 10)); break; case 8: /* EOM */ bit = i / 10 == msg->len - 1; break; case 9: /* ACK */ bit = cec_msg_is_broadcast(msg) ^ nacked; break; } cec_queue_pin_cec_event(adap, false, false, ts); if (bit) ts = ktime_add_us(ts, CEC_TIM_DATA_BIT_1_LOW); else ts = ktime_add_us(ts, CEC_TIM_DATA_BIT_0_LOW); cec_queue_pin_cec_event(adap, true, false, ts); if (bit) ts = ktime_add_us(ts, CEC_TIM_DATA_BIT_1_HIGH); else ts = ktime_add_us(ts, CEC_TIM_DATA_BIT_0_HIGH); } } static void vivid_cec_pin_events(struct vivid_dev *dev, const struct cec_msg *msg, bool nacked) { ktime_t ts = ktime_get(); unsigned int i; vivid_cec_pin_adap_events(dev->cec_rx_adap, ts, msg, nacked); for (i = 0; i < MAX_OUTPUTS; i++) vivid_cec_pin_adap_events(dev->cec_tx_adap[i], ts, msg, nacked); } static void vivid_cec_xfer_done_worker(struct work_struct *work) { struct vivid_cec_work *cw = container_of(work, struct vivid_cec_work, work.work); struct vivid_dev *dev = cw->dev; struct cec_adapter *adap = cw->adap; u8 dest = cec_msg_destination(&cw->msg); bool valid_dest; unsigned int i; valid_dest = cec_msg_is_broadcast(&cw->msg); if (!valid_dest) valid_dest = vivid_cec_find_dest_adap(dev, adap, dest); cw->tx_status = valid_dest ? CEC_TX_STATUS_OK : CEC_TX_STATUS_NACK; spin_lock(&dev->cec_slock); dev->cec_xfer_time_jiffies = 0; dev->cec_xfer_start_jiffies = 0; list_del(&cw->list); spin_unlock(&dev->cec_slock); vivid_cec_pin_events(dev, &cw->msg, !valid_dest); cec_transmit_attempt_done(cw->adap, cw->tx_status); /* Broadcast message */ if (adap != dev->cec_rx_adap) cec_received_msg(dev->cec_rx_adap, &cw->msg); for (i = 0; i < MAX_OUTPUTS && dev->cec_tx_adap[i]; i++) if (adap != dev->cec_tx_adap[i]) cec_received_msg(dev->cec_tx_adap[i], &cw->msg); kfree(cw); } static void vivid_cec_xfer_try_worker(struct work_struct *work) { struct vivid_cec_work *cw = container_of(work, struct vivid_cec_work, work.work); struct vivid_dev *dev = cw->dev; spin_lock(&dev->cec_slock); if (dev->cec_xfer_time_jiffies) { list_del(&cw->list); spin_unlock(&dev->cec_slock); cec_transmit_attempt_done(cw->adap, CEC_TX_STATUS_ARB_LOST); kfree(cw); } else { INIT_DELAYED_WORK(&cw->work, vivid_cec_xfer_done_worker); dev->cec_xfer_start_jiffies = jiffies; dev->cec_xfer_time_jiffies = usecs_to_jiffies(cw->usecs); spin_unlock(&dev->cec_slock); schedule_delayed_work(&cw->work, dev->cec_xfer_time_jiffies); } } static int vivid_cec_adap_enable(struct cec_adapter *adap, bool enable) { adap->cec_pin_is_high = true; return 0; } static int vivid_cec_adap_log_addr(struct cec_adapter *adap, u8 log_addr) { return 0; } /* * One data bit takes 2400 us, each byte needs 10 bits so that's 24000 us * per byte. */ #define USECS_PER_BYTE 24000 static int vivid_cec_adap_transmit(struct cec_adapter *adap, u8 attempts, u32 signal_free_time, struct cec_msg *msg) { struct vivid_dev *dev = cec_get_drvdata(adap); struct vivid_cec_work *cw = kzalloc(sizeof(*cw), GFP_KERNEL); long delta_jiffies = 0; if (cw == NULL) return -ENOMEM; cw->dev = dev; cw->adap = adap; cw->usecs = CEC_FREE_TIME_TO_USEC(signal_free_time) + msg->len * USECS_PER_BYTE; cw->msg = *msg; spin_lock(&dev->cec_slock); list_add(&cw->list, &dev->cec_work_list); if (dev->cec_xfer_time_jiffies == 0) { INIT_DELAYED_WORK(&cw->work, vivid_cec_xfer_done_worker); dev->cec_xfer_start_jiffies = jiffies; dev->cec_xfer_time_jiffies = usecs_to_jiffies(cw->usecs); delta_jiffies = dev->cec_xfer_time_jiffies; } else { INIT_DELAYED_WORK(&cw->work, vivid_cec_xfer_try_worker); delta_jiffies = dev->cec_xfer_start_jiffies + dev->cec_xfer_time_jiffies - jiffies; } spin_unlock(&dev->cec_slock); schedule_delayed_work(&cw->work, delta_jiffies < 0 ? 0 : delta_jiffies); return 0; } static int vivid_received(struct cec_adapter *adap, struct cec_msg *msg) { struct vivid_dev *dev = cec_get_drvdata(adap); struct cec_msg reply; u8 dest = cec_msg_destination(msg); u8 disp_ctl; char osd[14]; if (cec_msg_is_broadcast(msg)) dest = adap->log_addrs.log_addr[0]; cec_msg_init(&reply, dest, cec_msg_initiator(msg)); switch (cec_msg_opcode(msg)) { case CEC_MSG_SET_OSD_STRING: if (!cec_is_sink(adap)) return -ENOMSG; cec_ops_set_osd_string(msg, &disp_ctl, osd); switch (disp_ctl) { case CEC_OP_DISP_CTL_DEFAULT: strcpy(dev->osd, osd); dev->osd_jiffies = jiffies; break; case CEC_OP_DISP_CTL_UNTIL_CLEARED: strcpy(dev->osd, osd); dev->osd_jiffies = 0; break; case CEC_OP_DISP_CTL_CLEAR: dev->osd[0] = 0; dev->osd_jiffies = 0; break; default: cec_msg_feature_abort(&reply, cec_msg_opcode(msg), CEC_OP_ABORT_INVALID_OP); cec_transmit_msg(adap, &reply, false); break; } break; default: return -ENOMSG; } return 0; } static const struct cec_adap_ops vivid_cec_adap_ops = { .adap_enable = vivid_cec_adap_enable, .adap_log_addr = vivid_cec_adap_log_addr, .adap_transmit = vivid_cec_adap_transmit, .received = vivid_received, }; struct cec_adapter *vivid_cec_alloc_adap(struct vivid_dev *dev, unsigned int idx, bool is_source) { char name[sizeof(dev->vid_out_dev.name) + 2]; u32 caps = CEC_CAP_DEFAULTS | CEC_CAP_MONITOR_ALL | CEC_CAP_MONITOR_PIN; snprintf(name, sizeof(name), "%s%d", is_source ? dev->vid_out_dev.name : dev->vid_cap_dev.name, idx); return cec_allocate_adapter(&vivid_cec_adap_ops, dev, name, caps, 1); } |
1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/fs/befs/endian.h * * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com> * * Partially based on similar funtions in the sysv driver. */ #ifndef LINUX_BEFS_ENDIAN #define LINUX_BEFS_ENDIAN #include <asm/byteorder.h> static inline u64 fs64_to_cpu(const struct super_block *sb, fs64 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return le64_to_cpu((__force __le64)n); else return be64_to_cpu((__force __be64)n); } static inline fs64 cpu_to_fs64(const struct super_block *sb, u64 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return (__force fs64)cpu_to_le64(n); else return (__force fs64)cpu_to_be64(n); } static inline u32 fs32_to_cpu(const struct super_block *sb, fs32 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return le32_to_cpu((__force __le32)n); else return be32_to_cpu((__force __be32)n); } static inline fs32 cpu_to_fs32(const struct super_block *sb, u32 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return (__force fs32)cpu_to_le32(n); else return (__force fs32)cpu_to_be32(n); } static inline u16 fs16_to_cpu(const struct super_block *sb, fs16 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return le16_to_cpu((__force __le16)n); else return be16_to_cpu((__force __be16)n); } static inline fs16 cpu_to_fs16(const struct super_block *sb, u16 n) { if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) return (__force fs16)cpu_to_le16(n); else return (__force fs16)cpu_to_be16(n); } /* Composite types below here */ static inline befs_block_run fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n) { befs_block_run run; if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group); run.start = le16_to_cpu((__force __le16)n.start); run.len = le16_to_cpu((__force __le16)n.len); } else { run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group); run.start = be16_to_cpu((__force __be16)n.start); run.len = be16_to_cpu((__force __be16)n.len); } return run; } static inline befs_disk_block_run cpu_to_fsrun(const struct super_block *sb, befs_block_run n) { befs_disk_block_run run; if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) { run.allocation_group = cpu_to_le32(n.allocation_group); run.start = cpu_to_le16(n.start); run.len = cpu_to_le16(n.len); } else { run.allocation_group = cpu_to_be32(n.allocation_group); run.start = cpu_to_be16(n.start); run.len = cpu_to_be16(n.len); } return run; } static inline befs_data_stream fsds_to_cpu(const struct super_block *sb, const befs_disk_data_stream *n) { befs_data_stream data; int i; for (i = 0; i < BEFS_NUM_DIRECT_BLOCKS; ++i) data.direct[i] = fsrun_to_cpu(sb, n->direct[i]); data.max_direct_range = fs64_to_cpu(sb, n->max_direct_range); data.indirect = fsrun_to_cpu(sb, n->indirect); data.max_indirect_range = fs64_to_cpu(sb, n->max_indirect_range); data.double_indirect = fsrun_to_cpu(sb, n->double_indirect); data.max_double_indirect_range = fs64_to_cpu(sb, n-> max_double_indirect_range); data.size = fs64_to_cpu(sb, n->size); return data; } #endif //LINUX_BEFS_ENDIAN |
28 28 28 27 28 28 28 28 28 28 2 2 2 2 526 510 526 165 165 165 164 165 165 15 15 4 4 4 77 58 56 50 31 30 2 1 28 28 28 27 28 12 28 28 28 28 28 30 7 5 2 7 31 26 4 2 18 28 28 26 25 26 5 2 13 26 25 25 39 7 3 66 3 3 3 3 2 1 55 26 55 31 48 26 82 66 78 39 11 11 11 375 2 84 84 74 74 82 71 63 36 36 84 82 26 25 26 2 6 4 15 26 26 25 10 10 10 10 10 26 71 70 70 70 70 22 70 46 70 71 70 66 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 | /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) { posix_cpu_timers_exit_group(tsk); } else { /* * This can only happen if the caller is de_thread(). * FIXME: this is the temporary hack, we should teach * posix-cpu-timers to handle this case correctly. */ if (unlikely(has_group_leader_pid(tsk))) posix_cpu_timers_exit_group(tsk); } #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); atomic_dec(&__task_cred(p)->user->processes); rcu_read_unlock(); proc_flush_task(p); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); p = leader; if (unlikely(zap_leader)) goto repeat; } /* * Note that if this function returns a valid task_struct pointer (!NULL) * task->usage must remain >0 for the duration of the RCU critical section. */ struct task_struct *task_rcu_dereference(struct task_struct **ptask) { struct sighand_struct *sighand; struct task_struct *task; /* * We need to verify that release_task() was not called and thus * delayed_put_task_struct() can't run and drop the last reference * before rcu_read_unlock(). We check task->sighand != NULL, * but we can read the already freed and reused memory. */ retry: task = rcu_dereference(*ptask); if (!task) return NULL; probe_kernel_address(&task->sighand, sighand); /* * Pairs with atomic_dec_and_test() in put_task_struct(). If this task * was already freed we can not miss the preceding update of this * pointer. */ smp_rmb(); if (unlikely(task != READ_ONCE(*ptask))) goto retry; /* * We've re-checked that "task == *ptask", now we have two different * cases: * * 1. This is actually the same task/task_struct. In this case * sighand != NULL tells us it is still alive. * * 2. This is another task which got the same memory for task_struct. * We can't know this of course, and we can not trust * sighand != NULL. * * In this case we actually return a random value, but this is * correct. * * If we return NULL - we can pretend that we actually noticed that * *ptask was updated when the previous task has exited. Or pretend * that probe_slab_address(&sighand) reads NULL. * * If we return the new task (because sighand is not NULL for any * reason) - this is fine too. This (new) task can't go away before * another gp pass. * * And note: We could even eliminate the false positive if re-read * task->sighand once again to avoid the falsely NULL. But this case * is very unlikely so we don't care. */ if (!sighand) return NULL; return task; } void rcuwait_wake_up(struct rcuwait *w) { struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_trywake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ /* * Avoid using task_rcu_dereference() magic as long as we are careful, * see comment in rcuwait_wait_event() regarding ->exit_state. */ task = rcu_dereference(w->task); if (task) wake_up_process(task); rcu_read_unlock(); } /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { mm->owner = NULL; return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(c, &p->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search in the siblings */ list_for_each_entry(c, &p->real_parent->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { if (c->mm == mm) goto assign_new_owner; if (c->mm) break; } } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ mm->owner = NULL; return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); /* * Delay read_unlock() till we have the task_lock() * to ensure that c does not slip away underneath us */ read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); put_task_struct(c); goto retry; } mm->owner = c; task_unlock(c); put_task_struct(c); } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; struct core_state *core_state; exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); core_state = mm->core_state; if (core_state) { struct core_thread self; up_read(&mm->mmap_sem); self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; freezable_schedule(); } __set_current_state(TASK_RUNNING); down_read(&mm->mmap_sem); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); current->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; BUG_ON((!t->ptrace) != (t->parent == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; if (tsk->exit_state == EXIT_DEAD) list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; /* * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); /* * If do_exit is called because this processes oopsed, it's possible * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before * continuing. Amongst other possible reasons, this is to prevent * mm_release()->clear_child_tid() from writing to a user-controlled * kernel address. */ set_fs(USER_DS); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } profile_task_exit(tsk); kcov_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); validate_creds_for_do_exit(tsk); /* * We're taking recursive faults here in do_exit. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); do_exit(code); } EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; BUG_ON(exit_code & 0x80); /* core dumps don't get here */ if (signal_group_exit(sig)) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } struct waitid_info { pid_t pid; uid_t uid; int status; int cause; }; struct wait_opts { enum pid_type wo_type; int wo_flags; struct pid *wo_pid; struct waitid_info *wo_info; int wo_stat; struct rusage *wo_rusage; wait_queue_entry_t child_wait; int notask_error; }; static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-theads * which can reap other children at the same time. Until * we change k_getrusage()-like users to rely on this lock * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(¤t->sighand->siglock); write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock(&psig->stats_lock); spin_unlock_irq(¤t->sighand->siglock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (!eligible_pid(wo, p)) return 0; if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) return 0; return default_wake_function(wait, mode, sync, key); } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, 1, p); } static long do_wait(struct wait_opts *wo) { struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) goto notask; set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) goto end; retval = ptrace_do_wait(wo, tsk); if (retval) goto end; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) { retval = -ERESTARTSYS; if (!signal_pending(current)) { schedule(); goto repeat; } } end: __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; break; case P_PGID: type = PIDTYPE_PGID; if (upid <= 0) return -EINVAL; break; default: return -EINVAL; } if (type < PIDTYPE_MAX) pid = find_get_pid(upid); wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_access_begin(VERIFY_WRITE, infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_access_end(); return err; Efault: user_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_access_begin(VERIFY_WRITE, infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_access_end(); return err; Efault: user_access_end(); return -EFAULT; } #endif __weak void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort); |
150 150 31 150 150 150 147 60 60 185 185 169 5 5 5 69 69 69 69 69 69 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 | /* * PCM Interface - misc routines * Copyright (c) 1998 by Jaroslav Kysela <perex@perex.cz> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <linux/export.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_local.h" #define SND_PCM_FORMAT_UNKNOWN (-1) /* NOTE: "signed" prefix must be given below since the default char is * unsigned on some architectures! */ struct pcm_format_data { unsigned char width; /* bit width */ unsigned char phys; /* physical bit width */ signed char le; /* 0 = big-endian, 1 = little-endian, -1 = others */ signed char signd; /* 0 = unsigned, 1 = signed, -1 = others */ unsigned char silence[8]; /* silence data to fill */ }; /* we do lots of calculations on snd_pcm_format_t; shut up sparse */ #define INT __force int static struct pcm_format_data pcm_formats[(INT)SNDRV_PCM_FORMAT_LAST+1] = { [SNDRV_PCM_FORMAT_S8] = { .width = 8, .phys = 8, .le = -1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U8] = { .width = 8, .phys = 8, .le = -1, .signd = 0, .silence = { 0x80 }, }, [SNDRV_PCM_FORMAT_S16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x80, 0x00 }, }, [SNDRV_PCM_FORMAT_S24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_FLOAT_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_LE] = { .width = 64, .phys = 64, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_BE] = { .width = 64, .phys = 64, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_MU_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x7f }, }, [SNDRV_PCM_FORMAT_A_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x55 }, }, [SNDRV_PCM_FORMAT_IMA_ADPCM] = { .width = 4, .phys = 4, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_24] = { .width = 3, .phys = 3, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40] = { .width = 5, .phys = 5, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_DSD_U8] = { .width = 8, .phys = 8, .le = 1, .signd = 0, .silence = { 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, /* FIXME: the following two formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_GSM] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08, 0x00 }, }, [SNDRV_PCM_FORMAT_U20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x08, 0x00, 0x00 }, }, /* FIXME: the following format is not defined properly yet */ [SNDRV_PCM_FORMAT_SPECIAL] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08 }, }, [SNDRV_PCM_FORMAT_U20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 0, .silence = { 0x08, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x02 }, }, [SNDRV_PCM_FORMAT_U18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 0, .silence = { 0x02, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_G723_24_1B] = { .width = 3, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40_1B] = { .width = 5, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, }; /** * snd_pcm_format_signed - Check the PCM format is signed linear * @format: the format to check * * Return: 1 if the given PCM format is signed linear, 0 if unsigned * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_signed(snd_pcm_format_t format) { int val; if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return -EINVAL; if ((val = pcm_formats[(INT)format].signd) < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_signed); /** * snd_pcm_format_unsigned - Check the PCM format is unsigned linear * @format: the format to check * * Return: 1 if the given PCM format is unsigned linear, 0 if signed * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_unsigned(snd_pcm_format_t format) { int val; val = snd_pcm_format_signed(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_unsigned); /** * snd_pcm_format_linear - Check the PCM format is linear * @format: the format to check * * Return: 1 if the given PCM format is linear, 0 if not. */ int snd_pcm_format_linear(snd_pcm_format_t format) { return snd_pcm_format_signed(format) >= 0; } EXPORT_SYMBOL(snd_pcm_format_linear); /** * snd_pcm_format_little_endian - Check the PCM format is little-endian * @format: the format to check * * Return: 1 if the given PCM format is little-endian, 0 if * big-endian, or a negative error code if endian not specified. */ int snd_pcm_format_little_endian(snd_pcm_format_t format) { int val; if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return -EINVAL; if ((val = pcm_formats[(INT)format].le) < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_little_endian); /** * snd_pcm_format_big_endian - Check the PCM format is big-endian * @format: the format to check * * Return: 1 if the given PCM format is big-endian, 0 if * little-endian, or a negative error code if endian not specified. */ int snd_pcm_format_big_endian(snd_pcm_format_t format) { int val; val = snd_pcm_format_little_endian(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_big_endian); /** * snd_pcm_format_width - return the bit-width of the format * @format: the format to check * * Return: The bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_width(snd_pcm_format_t format) { int val; if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return -EINVAL; if ((val = pcm_formats[(INT)format].width) == 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_width); /** * snd_pcm_format_physical_width - return the physical bit-width of the format * @format: the format to check * * Return: The physical bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_physical_width(snd_pcm_format_t format) { int val; if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return -EINVAL; if ((val = pcm_formats[(INT)format].phys) == 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_physical_width); /** * snd_pcm_format_size - return the byte size of samples on the given format * @format: the format to check * @samples: sampling rate * * Return: The byte size of the given samples for the format, or a * negative error code if unknown format. */ ssize_t snd_pcm_format_size(snd_pcm_format_t format, size_t samples) { int phys_width = snd_pcm_format_physical_width(format); if (phys_width < 0) return -EINVAL; return samples * phys_width / 8; } EXPORT_SYMBOL(snd_pcm_format_size); /** * snd_pcm_format_silence_64 - return the silent data in 8 bytes array * @format: the format to check * * Return: The format pattern to fill or %NULL if error. */ const unsigned char *snd_pcm_format_silence_64(snd_pcm_format_t format) { if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return NULL; if (! pcm_formats[(INT)format].phys) return NULL; return pcm_formats[(INT)format].silence; } EXPORT_SYMBOL(snd_pcm_format_silence_64); /** * snd_pcm_format_set_silence - set the silence data on the buffer * @format: the PCM format * @data: the buffer pointer * @samples: the number of samples to set silence * * Sets the silence data on the buffer for the given samples. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_format_set_silence(snd_pcm_format_t format, void *data, unsigned int samples) { int width; unsigned char *dst, *pat; if ((INT)format < 0 || (INT)format > (INT)SNDRV_PCM_FORMAT_LAST) return -EINVAL; if (samples == 0) return 0; width = pcm_formats[(INT)format].phys; /* physical width */ pat = pcm_formats[(INT)format].silence; if (!width || !pat) return -EINVAL; /* signed or 1 byte data */ if (pcm_formats[(INT)format].signd == 1 || width <= 8) { unsigned int bytes = samples * width / 8; memset(data, *pat, bytes); return 0; } /* non-zero samples, fill using a loop */ width /= 8; dst = data; #if 0 while (samples--) { memcpy(dst, pat, width); dst += width; } #else /* a bit optimization for constant width */ switch (width) { case 2: while (samples--) { memcpy(dst, pat, 2); dst += 2; } break; case 3: while (samples--) { memcpy(dst, pat, 3); dst += 3; } break; case 4: while (samples--) { memcpy(dst, pat, 4); dst += 4; } break; case 8: while (samples--) { memcpy(dst, pat, 8); dst += 8; } break; } #endif return 0; } EXPORT_SYMBOL(snd_pcm_format_set_silence); /** * snd_pcm_limit_hw_rates - determine rate_min/rate_max fields * @runtime: the runtime instance * * Determines the rate_min and rate_max fields from the rates bits of * the given runtime->hw. * * Return: Zero if successful. */ int snd_pcm_limit_hw_rates(struct snd_pcm_runtime *runtime) { int i; for (i = 0; i < (int)snd_pcm_known_rates.count; i++) { if (runtime->hw.rates & (1 << i)) { runtime->hw.rate_min = snd_pcm_known_rates.list[i]; break; } } for (i = (int)snd_pcm_known_rates.count - 1; i >= 0; i--) { if (runtime->hw.rates & (1 << i)) { runtime->hw.rate_max = snd_pcm_known_rates.list[i]; break; } } return 0; } EXPORT_SYMBOL(snd_pcm_limit_hw_rates); /** * snd_pcm_rate_to_rate_bit - converts sample rate to SNDRV_PCM_RATE_xxx bit * @rate: the sample rate to convert * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate, or * SNDRV_PCM_RATE_KNOT for an unknown rate. */ unsigned int snd_pcm_rate_to_rate_bit(unsigned int rate) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if (snd_pcm_known_rates.list[i] == rate) return 1u << i; return SNDRV_PCM_RATE_KNOT; } EXPORT_SYMBOL(snd_pcm_rate_to_rate_bit); /** * snd_pcm_rate_bit_to_rate - converts SNDRV_PCM_RATE_xxx bit to sample rate * @rate_bit: the rate bit to convert * * Return: The sample rate that corresponds to the given SNDRV_PCM_RATE_xxx flag * or 0 for an unknown rate bit. */ unsigned int snd_pcm_rate_bit_to_rate(unsigned int rate_bit) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if ((1u << i) == rate_bit) return snd_pcm_known_rates.list[i]; return 0; } EXPORT_SYMBOL(snd_pcm_rate_bit_to_rate); static unsigned int snd_pcm_rate_mask_sanitize(unsigned int rates) { if (rates & SNDRV_PCM_RATE_CONTINUOUS) return SNDRV_PCM_RATE_CONTINUOUS; else if (rates & SNDRV_PCM_RATE_KNOT) return SNDRV_PCM_RATE_KNOT; return rates; } /** * snd_pcm_rate_mask_intersect - computes the intersection between two rate masks * @rates_a: The first rate mask * @rates_b: The second rate mask * * This function computes the rates that are supported by both rate masks passed * to the function. It will take care of the special handling of * SNDRV_PCM_RATE_CONTINUOUS and SNDRV_PCM_RATE_KNOT. * * Return: A rate mask containing the rates that are supported by both rates_a * and rates_b. */ unsigned int snd_pcm_rate_mask_intersect(unsigned int rates_a, unsigned int rates_b) { rates_a = snd_pcm_rate_mask_sanitize(rates_a); rates_b = snd_pcm_rate_mask_sanitize(rates_b); if (rates_a & SNDRV_PCM_RATE_CONTINUOUS) return rates_b; else if (rates_b & SNDRV_PCM_RATE_CONTINUOUS) return rates_a; else if (rates_a & SNDRV_PCM_RATE_KNOT) return rates_b; else if (rates_b & SNDRV_PCM_RATE_KNOT) return rates_a; return rates_a & rates_b; } EXPORT_SYMBOL_GPL(snd_pcm_rate_mask_intersect); /** * snd_pcm_rate_range_to_bits - converts rate range to SNDRV_PCM_RATE_xxx bit * @rate_min: the minimum sample rate * @rate_max: the maximum sample rate * * This function has an implicit assumption: the rates in the given range have * only the pre-defined rates like 44100 or 16000. * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate range, * or SNDRV_PCM_RATE_KNOT for an unknown range. */ unsigned int snd_pcm_rate_range_to_bits(unsigned int rate_min, unsigned int rate_max) { unsigned int rates = 0; int i; for (i = 0; i < snd_pcm_known_rates.count; i++) { if (snd_pcm_known_rates.list[i] >= rate_min && snd_pcm_known_rates.list[i] <= rate_max) rates |= 1 << i; } if (!rates) rates = SNDRV_PCM_RATE_KNOT; return rates; } EXPORT_SYMBOL_GPL(snd_pcm_rate_range_to_bits); |
8087 8092 142 8024 19 19 2 8024 52 1 52 10 8015 5858 2016 2016 3 3 2018 8088 8086 8086 17 8079 147 140 24 125 124 22 8558 8470 239 26 213 138 8556 26 1905 830 753 1114 285 911 1554 8555 8567 4252 1633 8318 4658 5 214 1517 3380 8021 8567 4223 8561 8569 5769 8196 102 8130 8031 1539 15 15 15 253 2 890 15 15 8032 438 8031 439 7971 255 255 8029 8003 88 85 88 88 7995 8007 8009 731 13 8011 7547 7546 7544 7546 483 7549 2 77 77 697 697 690 690 690 666 21 21 21 21 3918 87 7618 6743 6740 6741 7620 7617 1845 41 2 3407 6700 3496 178 7226 2627 3433 3340 151 3436 3435 800 800 588 86 86 621 604 4 3 1 1 665 7057 422 840 159 157 890 890 653 597 9 428 884 128 128 128 128 884 26 4 24 24 7 7 6 6 6 257 2595 257 15 15 14 252 246 179 24 179 177 106 251 251 245 179 2586 113 2 111 112 111 5781 18 5784 4 5704 3842 3818 256 255 182 180 178 178 42 42 40 4 13 13 10 182 182 240 166 166 166 55 65 70 74 74 74 2526 19 19 19 1458 1146 1456 1143 1144 28 6375 6250 994 6240 5811 5812 703 5786 697 690 685 1210 473 860 13 13 13 28 1031 6375 2526 2529 14 2 2 2518 2526 73 256 8025 426 426 6067 207 93 207 182 74 890 890 422 422 890 890 6376 6375 1059 7508 891 892 7506 5727 5748 1452 1353 1303 1298 778 5307 655 4733 4738 1301 4575 173 109 173 1499 8021 5795 8088 3328 8087 8024 8020 5755 211 7833 7829 364 366 8022 7608 7291 7288 7814 798 594 7243 7243 658 7056 6 5 8085 8127 70 8072 8130 39 39 11 8133 8110 3325 6371 6161 6158 388 304 286 246 41 589 588 3895 37 3849 6 6 3920 3918 90 3829 3693 3694 620 3690 3909 3804 11 56 3798 3797 3796 3823 1508 1510 1468 1468 1510 12 1508 1509 1467 76 1909 12 1444 1444 1444 8 1439 1439 1440 1439 1439 1403 1402 7 1395 1341 280 280 55 52 55 3637 134 135 135 2 7 132 12 14 123 137 137 11 5 118 136 2 136 136 136 138 4 21 3 2 577 577 577 577 575 335 566 567 577 1267 1267 1267 1266 617 172 466 25 196 451 66 66 433 433 438 460 88 87 87 89 41 41 41 41 2368 2368 4169 4170 611 4169 1258 4167 4160 5 4 4159 596 594 593 1268 1268 1267 1265 1 39 1 39 39 36 22 22 22 22 19 17 2408 2405 1979 25 5 5 2405 2017 1278 335 1280 16 1268 3 1266 810 2007 39 7 1946 1943 2 1945 1181 1180 1178 132 2293 2379 84 4376 4 4275 3035 8 3034 846 812 813 1681 1681 2406 2018 1679 371 1680 2380 2307 2308 1134 1398 996 1400 1398 1303 1291 2440 517 3461 3460 3463 3448 3118 1792 602 4160 4148 4150 3984 3951 1003 4187 1586 386 386 385 364 364 357 357 24 2 17 14 13 11 11 11 13 104 56 121 4688 4687 4672 4604 4601 504 1384 1490 3961 4578 925 4661 31 4551 4552 29 2 28 28 28 29 972 1080 1019 1017 301 968 971 1077 60 73 70 38 924 71 70 69 69 72 83 27 82 83 43 33 20 20 11 54 55 938 936 935 935 940 738 138 738 737 799 369 487 33 204 204 191 191 174 189 32 174 210 209 209 206 197 197 202 204 3 206 152 142 140 1 141 151 174 174 170 167 167 160 156 146 146 159 143 165 165 165 167 172 8 24 23 7 158 165 164 164 166 129 129 136 137 49 89 127 124 125 125 125 124 50 124 1 124 120 126 102 101 2 100 1 100 93 88 86 86 86 11 91 303 290 133 291 287 288 173 171 287 287 287 193 144 284 9 6 279 1 284 1 284 258 103 41 258 246 30 281 185 144 144 257 258 258 30 281 297 280 278 10 277 273 263 262 260 260 259 237 235 234 6 128 127 231 105 225 226 225 226 245 247 247 246 269 70 7 207 30 30 4 6 6 7 1 6 5 6 70 70 57 57 33 33 33 32 32 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/ima.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include <linux/build_bug.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result; char *kname; int len; BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0); result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } result->refcnt = 1; /* The empty path is special. */ if (unlikely(!len)) { if (empty) *empty = 1; if (!(flags & LOOKUP_EMPTY)) { putname(result); return ERR_PTR(-ENOENT); } } result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename * getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } struct filename * getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; result->refcnt = 1; audit_getname(result); return result; } void putname(struct filename *name) { BUG_ON(name->refcnt <= 0); if (--name->refcnt > 0) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } static int check_acl(struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * This does the basic permission checking */ static int acl_permission_check(struct inode *inode, int mask) { unsigned int mode = inode->i_mode; if (likely(uid_eq(current_fsuid(), inode->i_uid))) mode >>= 6; else { if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(inode, mask); if (error != -EAGAIN) return error; } if (in_group_p(inode->i_gid)) mode >>= 3; } /* * If the DACs are ok we don't need any capability check. */ if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) return 0; return -EACCES; } /** * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. */ int generic_permission(struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(inode)) return -EACCES; } retval = do_inode_permission(inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; unsigned seq, m_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; struct inode *link_inode; unsigned root_seq; int dfd; } __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->dfd = dfd; p->name = name; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static int __nd_alloc_stack(struct nameidata *nd) { struct saved *p; if (nd->flags & LOOKUP_RCU) { p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), GFP_ATOMIC); if (unlikely(!p)) return -ECHILD; } else { p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), GFP_KERNEL); if (unlikely(!p)) return -ENOMEM; } memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return 0; } /** * path_connected - Verify that a path->dentry is below path->mnt.mnt_root * @path: nameidate to verify * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; struct super_block *sb = mnt->mnt_sb; /* Bind mounts and multi-root filesystems can have disconnected paths */ if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); } static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; return __nd_alloc_stack(nd); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { path_put(&nd->root); nd->root.mnt = NULL; } } else { nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); } nd->depth = 0; } /* path_put is needed afterwards regardless of success or failure */ static bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { int res = __legitimize_mnt(path->mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static bool legitimize_links(struct nameidata *nd) { int i; for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: 0 on success, -ECHILD on failure * * unlazy_walk attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between unlazy_walk() failure and * terminate_walk(). */ static int unlazy_walk(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out1; if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) goto out; } rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); return 0; out2: nd->path.mnt = NULL; nd->path.dentry = NULL; out1: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; out: rcu_read_unlock(); return -ECHILD; } /** * unlazy_child - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: child of nd->path.dentry * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry * for ref-walk mode. @dentry must be a path found by a do_lookup call on * @nd. Must be called from rcu-walk context. * Nothing should touch nameidata between unlazy_child() failure and * terminate_walk(). */ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq) { BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) goto out2; if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) { rcu_read_unlock(); dput(dentry); goto drop_root_mnt; } /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { rcu_read_unlock(); dput(dentry); return -ECHILD; } } rcu_read_unlock(); return 0; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: rcu_read_unlock(); drop_root_mnt: if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; return -ECHILD; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; if (unlikely(unlazy_walk(nd))) return -ECHILD; } if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static void set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); } } static void path_put_conditional(struct path *path, struct nameidata *nd) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } static inline void path_to_nameidata(const struct path *path, struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) { dput(nd->path.dentry); if (nd->path.mnt != path->mnt) mntput(nd->path.mnt); } nd->path.mnt = path->mnt; nd->path.dentry = path->dentry; } static int nd_jump_root(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->flags |= LOOKUP_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ void nd_jump_link(struct path *path) { struct nameidata *nd = current->nameidata; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } int sysctl_protected_symlinks __read_mostly = 0; int sysctl_protected_hardlinks __read_mostly = 0; int sysctl_protected_fifos __read_mostly; int sysctl_protected_regular __read_mostly; /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; kuid_t puid; if (!sysctl_protected_symlinks) return 0; /* Allowed if owner and follower match. */ inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ puid = parent->i_uid; if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_link_denied("follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * Returns 0 if successful, -ve on error. */ static int may_linkat(struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; audit_log_link_denied("linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @dir_mode: mode bits of directory * @dir_uid: owner of directory * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, struct inode * const inode) { if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || uid_eq(inode->i_uid, dir_uid) || uid_eq(current_fsuid(), inode->i_uid)) return 0; if (likely(dir_mode & 0002) || (dir_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { return -EACCES; } return 0; } static __always_inline const char *get_link(struct nameidata *nd) { struct saved *last = nd->stack + nd->depth - 1; struct dentry *dentry = last->link.dentry; struct inode *inode = nd->link_inode; int error; const char *res; if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); nd->last_type = LAST_BIND; res = inode->i_link; if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD)) { if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); res = get(dentry, inode, &last->done); } } else { res = get(dentry, inode, &last->done); } if (IS_ERR_OR_NULL(res)) return res; } if (*res == '/') { if (!nd->root.mnt) set_root(nd); if (unlikely(nd_jump_root(nd))) return ERR_PTR(-ECHILD); while (unlikely(*++res == '/')) ; } if (!*res) res = NULL; return res; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; mnt = path->dentry->d_op->d_automount(path); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate * it doesn't want to automount. For instance, autofs would do * this so that its userspace daemon can mount on this dentry. * * However, we can only permit this if it's a terminal point in * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } if (!mnt) /* mount collision */ return 0; if (!*need_mntput) { /* lock_mount() may release path->mnt on error */ mntget(path->mnt); *need_mntput = true; } err = finish_automount(mnt, path); switch (err) { case -EBUSY: /* Someone else made a mount here whilst we were busy */ return 0; case 0: path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); return 0; default: return err; } } /* * Handle a dentry that is managed in some way. * - Flagged for transit management (autofs) * - Flagged as mountpoint * - Flagged as automount point * * This may only be called in refwalk mode. * * Serialization is taken care of in namespace.c */ static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; bool need_mntput = false; int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ while (managed = READ_ONCE(path->dentry->d_flags), managed &= DCACHE_MANAGED_DENTRY, unlikely(managed != 0)) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } /* Transit to a mounted filesystem. */ if (managed & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); need_mntput = true; continue; } /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this * namespace got unmounted before lookup_mnt() could * get it */ } /* Handle an automount point */ if (managed & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; continue; } /* We didn't change the current path point */ break; } if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (ret == -EISDIR || !ret) ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; if (unlikely(ret < 0)) path_put_conditional(path, nd); return ret; } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); static inline int managed_dentry_rcu(const struct path *path) { return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? path->dentry->d_op->d_manage(path, true) : 0; } /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { for (;;) { struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; case -EISDIR: return true; case 0: break; } if (!d_mountpoint(path->dentry)) return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; *seqp = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the * dentry sequence number here after this d_inode read, * because a mount-point is always pinned. */ *inode = path->dentry->d_inode; } return !read_seqretry(&mount_lock, nd->m_seq) && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; while (1) { if (path_equal(&nd->path, &nd->root)) break; if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; unsigned seq; inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; if (unlikely(!path_connected(&nd->path))) return -ECHILD; break; } else { struct mount *mnt = real_mount(nd->path.mnt); struct mount *mparent = mnt->mnt_parent; struct dentry *mountpoint = mnt->mnt_mountpoint; struct inode *inode2 = mountpoint->d_inode; unsigned seq = read_seqcount_begin(&mountpoint->d_seq); if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) return -ECHILD; if (&mparent->mnt == nd->path.mnt) break; /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; inode = inode2; nd->seq = seq; } } while (unlikely(d_mountpoint(nd->path.dentry))) { struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) return -ECHILD; if (!mounted) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } nd->inode = inode; return 0; } /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path) { unsigned managed; int ret; while (managed = READ_ONCE(path->dentry->d_flags), unlikely(managed & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. * * We indicate to the filesystem if someone is trying to mount * something here. This gives autofs the chance to deny anyone * other than its daemon the right to mount on its * superstructure. * * The filesystem may sleep at this point. */ if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } /* Transit to a mounted filesystem. */ if (managed & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); continue; } /* Don't handle automount points here */ break; } return 0; } EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() */ static void follow_mount(struct path *path) { while (d_mountpoint(path->dentry)) { struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); } } static int path_parent_directory(struct path *path) { struct dentry *old = path->dentry; /* rare case of legitimate dget_parent()... */ path->dentry = dget_parent(path->dentry); dput(old); if (unlikely(!path_connected(path))) return -ENOENT; return 0; } static int follow_dotdot(struct nameidata *nd) { while(1) { if (path_equal(&nd->path, &nd->root)) break; if (nd->path.dentry != nd->path.mnt->mnt_root) { int ret = path_parent_directory(&nd->path); if (ret) return ret; break; } if (!follow_up(&nd->path)) break; } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; return 0; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ static struct dentry *__lookup_hash(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int status = 1; int err; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { if (unlazy_walk(nd)) return -ECHILD; return 0; } /* * This sequence count validates that the inode matches * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); negative = d_is_negative(dentry); if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) return -ECHILD; /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. * * The memory barrier in read_seqcount_begin of child is * enough, we can use __read_seqcount_retry here. */ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) return -ECHILD; *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) { /* * Note: do negative dentry check after revalidation in * case that drops it. */ if (unlikely(negative)) return -ENOENT; path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 1; } if (unlazy_child(nd, dentry, seq)) return -ECHILD; if (unlikely(status == -ECHILD)) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return 0; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return status; } if (unlikely(d_is_negative(dentry))) { dput(dentry); return -ENOENT; } path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd); if (likely(err > 0)) *inode = d_backing_inode(path->dentry); return err; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { if (!(flags & LOOKUP_NO_REVAL)) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; if (unlazy_walk(nd)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); } static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { if (!nd->root.mnt) set_root(nd); if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else return follow_dotdot(nd); } return 0; } static int pick_link(struct nameidata *nd, struct path *link, struct inode *inode, unsigned seq) { int error; struct saved *last; if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { path_to_nameidata(link, nd); return -ELOOP; } if (!(nd->flags & LOOKUP_RCU)) { if (link->mnt == nd->path.mnt) mntget(link->mnt); } error = nd_alloc_stack(nd); if (unlikely(error)) { if (error == -ECHILD) { if (unlikely(!legitimize_path(nd, link, seq))) { drop_links(nd); nd->depth = 0; nd->flags &= ~LOOKUP_RCU; nd->path.mnt = NULL; nd->path.dentry = NULL; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); } else if (likely(unlazy_walk(nd)) == 0) error = nd_alloc_stack(nd); } if (error) { path_put(link); return error; } } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); nd->link_inode = inode; last->seq = seq; return 1; } enum {WALK_FOLLOW = 1, WALK_MORE = 2}; /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ static inline int step_into(struct nameidata *nd, struct path *path, int flags, struct inode *inode, unsigned seq) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); if (likely(!d_is_symlink(path->dentry)) || !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { /* not a symlink or should not follow */ path_to_nameidata(path, nd); nd->inode = inode; nd->seq = seq; return 0; } /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path->dentry->d_seq, seq)) return -ECHILD; } return pick_link(nd, path, inode, seq); } static int walk_component(struct nameidata *nd, int flags) { struct path path; struct inode *inode; unsigned seq; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { err = handle_dots(nd, nd->last_type); if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return err; } err = lookup_fast(nd, &path, &inode, &seq); if (unlikely(err <= 0)) { if (err < 0) return err; path.dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(path.dentry)) return PTR_ERR(path.dentry); path.mnt = nd->path.mnt; err = follow_managed(&path, nd); if (unlikely(err < 0)) return err; if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; } seq = 0; /* we are already out of RCU mode */ inode = d_backing_inode(path.dentry); } return step_into(nd, &path, flags, inode, seq); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long a = 0, b, x = 0, y = (unsigned long)salt; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline u64 hash_name(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); return hashlen_create(end_name_hash(hash), len); } #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int err; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) return 0; /* At this point we know we have a real path component. */ for(;;) { u64 hash_len; int type; err = may_lookup(nd); if (err) return err; hash_len = hash_name(nd->path.dentry, name); type = LAST_NORM; if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; nd->flags |= LOOKUP_JUMPED; } break; case 1: type = LAST_DOT; } if (likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) return err; hash_len = this.hash_len; name = this.name; } } nd->last.hash_len = hash_len; nd->last.name = name; nd->last_type = type; name += hashlen_len(hash_len); if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname body, done */ if (!nd->depth) return 0; name = nd->stack[nd->depth - 1].name; /* trailing symlink, done */ if (!name) return 0; /* last component of nested symlink */ err = walk_component(nd, WALK_FOLLOW); } else { /* not the last component */ err = walk_component(nd, WALK_FOLLOW | WALK_MORE); } if (err < 0) return err; if (err) { const char *s = get_link(nd); if (IS_ERR(s)) return PTR_ERR(s); err = 0; if (unlikely(!s)) { /* jumped */ put_link(nd); } else { nd->stack[nd->depth - 1].name = name; name = s; continue; } } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (unlazy_walk(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { const char *s = nd->name->name; if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; nd->path.mnt = NULL; nd->path.dentry = NULL; nd->m_seq = read_seqbegin(&mount_lock); if (*s == '/') { set_root(nd); if (likely(!nd_jump_root(nd))) return s; return ERR_PTR(-ECHILD); } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } return s; } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) { fdput(f); return ERR_PTR(-ENOTDIR); } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } fdput(f); return s; } } static const char *trailing_symlink(struct nameidata *nd) { const char *s; int error = may_follow_link(nd); if (unlikely(error)) return ERR_PTR(error); nd->flags |= LOOKUP_PARENT; nd->stack[0].name = NULL; s = get_link(nd); return s ? s : ""; } static inline int lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; return walk_component(nd, 0); } static int handle_lookup_down(struct nameidata *nd) { struct path path = nd->path; struct inode *inode = nd->inode; unsigned seq = nd->seq; int err; if (nd->flags & LOOKUP_RCU) { /* * don't bother with unlazy_walk on failure - we are * at the very beginning of walk, so we lose nothing * if we simply redo everything in non-RCU mode */ if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) return -ECHILD; } else { dget(path.dentry); err = follow_managed(&path, nd); if (unlikely(err < 0)) return err; inode = d_backing_inode(path.dentry); seq = 0; } path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; return 0; } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } static int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); if (unlikely(root)) { nd.root = *root; flags |= LOOKUP_ROOT; } set_nameidata(&nd, dfd, name); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_PARENT); restore_nameidata(); putname(name); return retval; } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } static struct filename *filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { int retval; struct nameidata nd; if (IS_ERR(name)) return name; set_nameidata(&nd, dfd, name); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, LOOKUP_PARENT); } else { putname(name); name = ERR_PTR(retval); } restore_nameidata(); return name; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename; struct dentry *d; struct qstr last; int type; filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, &last, &type); if (IS_ERR(filename)) return ERR_CAST(filename); if (unlikely(type != LAST_NORM)) { path_put(path); putname(filename); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); return d; } int kern_path(const char *name, unsigned int flags, struct path *path) { return filename_lookup(AT_FDCWD, getname_kernel(name), flags, path, NULL); } EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct path root = {.mnt = mnt, .dentry = dentry}; /* the first argument of filename_lookup() is ignored with root */ return filename_lookup(AT_FDCWD, getname_kernel(name), flags , path, &root); } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_len_common(const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (unlikely(name[0] == '.')) { if (len < 2 || (len == 2 && name[1] == '.')) return -EACCES; } while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_len_common(name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_len_common(name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_len_common(name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *child, *parent; struct qstr this; int ret; ret = path_parent_directory(path); if (ret) return ret; parent = path->dentry; this.name = "pts"; this.len = 3; child = d_hash_and_lookup(parent, &this); if (!child) return -ENOENT; path->dentry = child; dput(parent); follow_mount(path); return 0; } #endif int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { return filename_lookup(dfd, getname_flags(name, flags, empty), flags, path, NULL); } EXPORT_SYMBOL(user_path_at_empty); /** * mountpoint_last - look up last component for umount * @nd: pathwalk nameidata - currently pointing at parent directory of "last" * * This is a special lookup_last function just for umount. In this case, we * need to resolve the path without doing any revalidation. * * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since * mountpoints are always pinned in the dcache, their ancestors are too. Thus, * in almost all cases, this lookup will be served out of the dcache. The only * cases where it won't are if nd->last refers to a symlink or the path is * bogus and it doesn't exist. * * Returns: * -error: if there was an error during lookup. This includes -ENOENT if the * lookup found a negative dentry. * * 0: if we successfully resolved nd->last and found it to not to be a * symlink that needs to be followed. * * 1: if we successfully resolved nd->last and found it to be a symlink * that needs to be followed. */ static int mountpoint_last(struct nameidata *nd) { int error = 0; struct dentry *dir = nd->path.dentry; struct path path; /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { if (unlazy_walk(nd)) return -ECHILD; } nd->flags &= ~LOOKUP_PARENT; if (unlikely(nd->last_type != LAST_NORM)) { error = handle_dots(nd, nd->last_type); if (error) return error; path.dentry = dget(nd->path.dentry); } else { path.dentry = d_lookup(dir, &nd->last); if (!path.dentry) { /* * No cached dentry. Mounted dentries are pinned in the * cache, so that means that this dentry is probably * a symlink or the path doesn't actually point * to a mounted dentry. */ path.dentry = lookup_slow(&nd->last, dir, nd->flags | LOOKUP_NO_REVAL); if (IS_ERR(path.dentry)) return PTR_ERR(path.dentry); } } if (d_is_negative(path.dentry)) { dput(path.dentry); return -ENOENT; } path.mnt = nd->path.mnt; return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0); } /** * path_mountpoint - look up a path to be umounted * @nd: lookup context * @flags: lookup flags * @path: pointer to container for result * * Look up the given name, but don't attempt to revalidate the last component. * Returns 0 and "path" will be valid on success; Returns error otherwise. */ static int path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; while (!(err = link_path_walk(s, nd)) && (err = mountpoint_last(nd)) > 0) { s = trailing_symlink(nd); } if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; follow_mount(path); } terminate_walk(nd); return err; } static int filename_mountpoint(int dfd, struct filename *name, struct path *path, unsigned int flags) { struct nameidata nd; int error; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name); error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); if (unlikely(error == -ECHILD)) error = path_mountpoint(&nd, flags, path); if (unlikely(error == -ESTALE)) error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) audit_inode(name, path->dentry, 0); restore_nameidata(); putname(name); return error; } /** * user_path_mountpoint_at - lookup a path from userland in order to umount it * @dfd: directory file descriptor * @name: pathname from userland * @flags: lookup flags * @path: pointer to container to hold result * * A umount is a special case for path walking. We're not actually interested * in the inode in this situation, and ESTALE errors can be a problem. We * simply want track down the dentry and vfsmount attached at the mountpoint * and avoid revalidating the last component. * * Returns 0 and populates "path" on success. */ int user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, struct path *path) { return filename_mountpoint(dfd, getname(name), path, flags); } int kern_path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { return filename_mountpoint(dfd, getname_kernel(name), path, flags); } EXPORT_SYMBOL(kern_path_mountpoint); int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct inode *dir, struct dentry *child) { struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; s_user_ns = dir->i_sb->s_user_ns; if (!kuid_has_mapping(s_user_ns, current_fsuid()) || !kgid_has_mapping(s_user_ns, current_fsgid())) return -EOVERFLOW; return inode_permission(dir, MAY_WRITE | MAY_EXEC); } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { struct dentry *p; if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); p = d_ancestor(p2, p1); if (p) { inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error = may_create(dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; /*FALLTHRU*/ case S_IFIFO: case S_IFSOCK: flag &= ~O_TRUNC; break; } error = inode_permission(inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(inode)) return -EPERM; return 0; } static int handle_truncate(struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; /* * Refuse to truncate files with mandatory locks held on them. */ error = locks_verify_locked(filp); if (!error) error = security_path_truncate(path); if (!error) { error = do_truncate(path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; s_user_ns = dir->dentry->d_sb->s_user_ns; if (!kuid_has_mapping(s_user_ns, current_fsuid()) || !kgid_has_mapping(s_user_ns, current_fsgid())) return -EOVERFLOW; error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */ open_flag &= ~O_TRUNC; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { /* * We didn't have the inode before the open, so check open * permission here. */ int acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { WARN_ON(!(open_flag & O_CREAT)); fsnotify_create(dir, dentry); acc_mode = 0; } error = may_open(&file->f_path, acc_mode, open_flag); if (WARN_ON(error > 0)) error = -EINVAL; } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (file->f_mode & FMODE_CREATED) fsnotify_create(dir, dentry); if (unlikely(d_is_negative(dentry))) { error = -ENOENT; } else { path->dentry = dentry; path->mnt = nd->path.mnt; return 0; } } } dput(dentry); return error; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return -ENOENT; file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ goto out_no_open; } /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (open_flag & O_CREAT) { if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); if (unlikely(!got_write)) { create_error = -EROFS; open_flag &= ~O_CREAT; if (open_flag & (O_EXCL | O_TRUNC)) goto no_open; /* No side effects, safe to clear O_CREAT */ } else { create_error = may_o_create(&nd->path, dentry, mode); if (create_error) { open_flag &= ~O_CREAT; if (open_flag & O_EXCL) goto no_open; } } } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && unlikely(!got_write)) { /* * No O_CREATE -> atomicity not a requirement -> fall * back to lookup + open */ goto no_open; } if (dir_inode->i_op->atomic_open) { error = atomic_open(nd, dentry, path, file, op, open_flag, mode); if (unlikely(error == -ENOENT) && create_error) error = create_error; return error; } no_open: if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; fsnotify_create(dir_inode, dentry); } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } out_no_open: path->dentry = dentry; path->mnt = nd->path.mnt; return 0; out_dput: dput(dentry); return error; } /* * Handle the last step of open() */ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; kuid_t dir_uid = nd->inode->i_uid; umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; struct path path; int error; nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (unlikely(error)) return error; goto finish_open; } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &path, &inode, &seq); if (likely(error > 0)) goto finish_lookup; if (error < 0) return error; BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ /* * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED * has been cleared when we got to the last component we are * about to look up */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (error) goto out; if (file->f_mode & FMODE_OPENED) { if ((file->f_mode & FMODE_CREATED) || !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = 0; path_to_nameidata(&path, nd); goto finish_open_created; } /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ if (got_write) { mnt_drop_write(nd->path.mnt); got_write = false; } error = follow_managed(&path, nd); if (unlikely(error < 0)) return error; if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; } /* * create/update audit record if it already exists. */ audit_inode(nd->name, path.dentry, 0); if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { path_to_nameidata(&path, nd); return -EEXIST; } seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; finish_open: /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { error = -EISDIR; if (d_is_dir(nd->path.dentry)) goto out; error = may_create_in_sticky(dir_mode, dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) goto out; } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) goto out; got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file); if (error) goto out; opened: error = ima_file_check(file, op->acc_mode); if (!error && will_truncate) error = handle_truncate(file); out: if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (got_write) mnt_drop_write(nd->path.mnt); return error; } struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) { struct dentry *child = NULL; struct inode *dir = dentry->d_inode; struct inode *inode; int error; /* we want directory to be writable */ error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out_err; error = -EOPNOTSUPP; if (!dir->i_op->tmpfile) goto out_err; error = -ENOMEM; child = d_alloc(dentry, &slash_name); if (unlikely(!child)) goto out_err; if (!IS_POSIXACL(dir)) mode &= ~current_umask(); error = dir->i_op->tmpfile(dir, child, mode); if (error) goto out_err; error = -ENOENT; inode = child->d_inode; if (unlikely(!inode)) goto out_err; if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } return child; out_err: dput(child); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_tmpfile); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); if (IS_ERR(child)) goto out2; dput(path.dentry); path.dentry = child; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ error = may_open(&path, 0, op->open_flag); if (error) goto out2; file->f_path.mnt = path.mnt; error = finish_open(file, child, NULL); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (error = do_last(nd, file, op)) > 0) { nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); s = trailing_symlink(nd); } terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; int type; int err2; int error; bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); /* * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any * other flags passed in are ignored! */ lookup_flags &= LOOKUP_REVAL; name = filename_parentat(dfd, name, lookup_flags, path, &last, &type); if (IS_ERR(name)) return ERR_CAST(name); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. */ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; if (d_is_positive(dentry)) goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!is_dir && last.name[last.len])) { error = -ENOENT; goto fail; } if (unlikely(err2)) { error = err2; goto fail; } putname(name); return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); putname(name); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { return filename_create(dfd, getname_kernel(pathname), path, lookup_flags); } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { return filename_create(dfd, getname(pathname), path, lookup_flags); } EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int error = may_create(dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } long do_mknodat(int dfd, const char __user *filename, umode_t mode, unsigned int dev) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) return error; retry: dentry = user_path_create(dfd, filename, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (error) goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); if (!error) ima_post_path_mknod(dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(path.dentry->d_inode,dentry,mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } out: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, filename, mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, filename, mode, dev); } int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int error = may_create(dir, dentry); unsigned max_links = dir->i_sb->s_max_links; if (error) return error; if (!dir->i_op->mkdir) return -EPERM; mode &= (S_IRWXUGO|S_ISVTX); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; if (max_links && dir->i_nlink >= max_links) return -EMLINK; error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkdir); long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = user_path_create(dfd, pathname, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, pathname, mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, pathname, mode); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) { int error = may_delete(dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); long do_rmdir(int dfd, const char __user *pathname) { int error = 0; struct filename *name; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: name = filename_parentat(dfd, getname(pathname), lookup_flags, &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit1; case LAST_DOT: error = -EINVAL; goto exit1; case LAST_ROOT: error = -EBUSY; goto exit1; } error = mnt_want_write(path.mnt); if (error) goto exit1; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; if (!dentry->d_inode) { error = -ENOENT; goto exit3; } error = security_path_rmdir(&path, dentry); if (error) goto exit3; error = vfs_rmdir(path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, pathname); } /** * vfs_unlink - unlink a filesystem object * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. */ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { fsnotify_link_count(target); d_delete(dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ long do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); error = -EISDIR; if (type != LAST_NORM) goto exit1; error = mnt_want_write(path.mnt); if (error) goto exit1; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; inode = dentry->d_inode; if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit2; error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit1: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } putname(name); return error; slashes: if (d_is_negative(dentry)) error = -ENOENT; else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit2; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, pathname); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { int error = may_create(dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); long do_symlinkat(const char __user *oldname, int newdfd, const char __user *newname) { int error; struct filename *from; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; from = getname(oldname); if (IS_ERR(from)) return PTR_ERR(from); retry: dentry = user_path_create(newdfd, newname, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(oldname, newdfd, newname); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(oldname, AT_FDCWD, newname); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. */ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, int flags) { struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; /* * To use null names we require CAP_DAC_READ_SEARCH * This ensures that not everyone will be able to create * handlink using the passed filedescriptor. */ if (flags & AT_EMPTY_PATH) { if (!capable(CAP_DAC_READ_SEARCH)) return -ENOENT; how = LOOKUP_EMPTY; } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; new_dentry = user_path_create(newdfd, newname, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; error = may_linkat(&old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out: path_put(&old_path); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, oldname, newdfd, newname, flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /** * vfs_rename - rename a filesystem object * @old_dir: parent of source * @old_dentry: source * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we have to lock _four_ objects - parents and victim (if it exists), * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, struct inode **delegated_inode, unsigned int flags) { int error; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; if (source == target) return 0; error = may_delete(old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(new_dir, new_dentry, is_dir); else error = may_delete(new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) return -EINVAL; if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) return -EPERM; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: from = filename_parentat(olddfd, getname(oldname), lookup_flags, &old_path, &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } to = filename_parentat(newdfd, getname(newname), lookup_flags, &new_path, &new_last, &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; } error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; if (flags & RENAME_EXCHANGE) { error = -ENOENT; if (d_is_negative(new_dentry)) goto exit5; if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; error = vfs_rename(old_path.dentry->d_inode, old_dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); putname(to); exit1: path_put(&old_path); putname(from); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } exit: return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, oldname, newdfd, newname, flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, oldname, newdfd, newname, 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } int vfs_whiteout(struct inode *dir, struct dentry *dentry) { int error = may_create(dir, dentry); if (error) return error; if (!dir->i_op->mknod) return -EPERM; return dir->i_op->mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } EXPORT_SYMBOL(vfs_whiteout); int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); if (IS_ERR(link)) goto out; len = strlen(link); if (len > (unsigned) buflen) len = buflen; if (copy_to_user(buffer, link, len)) len = -EFAULT; out: return len; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = inode->i_link; if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { DEFINE_DELAYED_CALL(done); int res = readlink_copy(buffer, buflen, page_get_link(dentry, d_inode(dentry), &done)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS */ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata = NULL; int err; unsigned int flags = 0; if (nofs) flags |= AOP_FLAG_NOFS; retry: err = pagecache_write_begin(NULL, mapping, 0, len-1, flags, &page, &fsdata); if (err) goto fail; memcpy(page_address(page), symname, len-1); err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); |
35 35 30 4 33 33 30 30 30 30 23 46 48 48 46 31 29 29 114 8 2 5 6 2 2 2 4 2 4 4 2 2 4 71 71 76 75 75 76 2 15 4 3 4 2 3 3 2 15 3 1 1 1 3 2 3 75 69 69 69 69 19 50 69 75 25 26 3 3 2 2 1 15 3 2 2 2 1 3 1 61 5 2 4 2 8 3 41 8 7 50 18 17 1 18 18 1 53 53 53 50 50 50 50 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/device.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/devpts_fs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/compat.h> #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do {} while (0) #endif #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; static struct tty_driver *pts_driver; static DEFINE_MUTEX(devpts_mutex); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { BUG_ON(!tty); if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { if (tty_io_error(tty)) return; if (tty->count > 2) return; } set_bit(TTY_IO_ERROR, &tty->flags); wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); spin_lock_irq(&tty->ctrl_lock); tty->packet = 0; spin_unlock_irq(&tty->ctrl_lock); /* Review - krefs on tty_link ?? */ if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); if (tty->link->driver_data) devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif tty_vhangup(tty->link); } } /* * The unthrottle routine is called by the line discipline to signal * that it can receive more characters. For PTY's, the TTY_THROTTLED * flag is always set, to force the line discipline to always call the * unthrottle routine when there are fewer than TTY_THRESHOLD_UNTHROTTLE * characters in the queue. This is necessary since each time this * happens, we need to wake up any sleeping processes that could be * (1) trying to send data to the pty, or (2) waiting in wait_until_sent() * for the pty buffer to be drained. */ static void pty_unthrottle(struct tty_struct *tty) { tty_wakeup(tty->link); set_bit(TTY_THROTTLED, &tty->flags); } /** * pty_write - write to a pty * @tty: the tty we write from * @buf: kernel buffer of data * @count: bytes to write * * Our "hardware" write method. Data is coming from the ldisc which * may be in a non sleeping state. We simply throw this at the other * end of the link as if we were an IRQ handler receiving stuff for * the other side of the pty/tty pair. */ static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c) { struct tty_struct *to = tty->link; if (tty->stopped || !c) return 0; return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** * pty_write_room - write space * @tty: tty we are writing from * * Report how many bytes the ldisc can send into the queue for * the other device. */ static int pty_write_room(struct tty_struct *tty) { if (tty->stopped) return 0; return tty_buffer_space_avail(tty->link->port); } /** * pty_chars_in_buffer - characters currently in our tx queue * @tty: our tty * * Report how much we have in the transmit queue. As everything is * instantly at the other end this is easy to implement. */ static int pty_chars_in_buffer(struct tty_struct *tty) { return 0; } /* Set the lock flag on a pty */ static int pty_set_lock(struct tty_struct *tty, int __user *arg) { int val; if (get_user(val, arg)) return -EFAULT; if (val) set_bit(TTY_PTY_LOCK, &tty->flags); else clear_bit(TTY_PTY_LOCK, &tty->flags); return 0; } static int pty_get_lock(struct tty_struct *tty, int __user *arg) { int locked = test_bit(TTY_PTY_LOCK, &tty->flags); return put_user(locked, arg); } /* Set the packet mode on a pty */ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode; if (get_user(pktmode, arg)) return -EFAULT; spin_lock_irq(&tty->ctrl_lock); if (pktmode) { if (!tty->packet) { tty->link->ctrl_status = 0; smp_mb(); tty->packet = 1; } } else tty->packet = 0; spin_unlock_irq(&tty->ctrl_lock); return 0; } /* Get the packet mode of a pty */ static int pty_get_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode = tty->packet; return put_user(pktmode, arg); } /* Send a signal to the slave */ static int pty_signal(struct tty_struct *tty, int sig) { struct pid *pgrp; if (sig != SIGINT && sig != SIGQUIT && sig != SIGTSTP) return -EINVAL; if (tty->link) { pgrp = tty_get_pgrp(tty->link); if (pgrp) kill_pgrp(pgrp, sig, 1); put_pid(pgrp); } return 0; } static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; if (!to) return; tty_buffer_flush(to, NULL); if (to->packet) { spin_lock_irq(&tty->ctrl_lock); tty->ctrl_status |= TIOCPKT_FLUSHWRITE; wake_up_interruptible(&to->read_wait); spin_unlock_irq(&tty->ctrl_lock); } } static int pty_open(struct tty_struct *tty, struct file *filp) { if (!tty || !tty->link) return -ENODEV; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) goto out; if (test_bit(TTY_PTY_LOCK, &tty->link->flags)) goto out; if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; out: set_bit(TTY_IO_ERROR, &tty->flags); return -EIO; } static void pty_set_termios(struct tty_struct *tty, struct ktermios *old_termios) { /* See if packet mode change of state. */ if (tty->link && tty->link->packet) { int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty); int old_flow = ((old_termios->c_iflag & IXON) && (old_termios->c_cc[VSTOP] == '\023') && (old_termios->c_cc[VSTART] == '\021')); int new_flow = (I_IXON(tty) && STOP_CHAR(tty) == '\023' && START_CHAR(tty) == '\021'); if ((old_flow != new_flow) || extproc) { spin_lock_irq(&tty->ctrl_lock); if (old_flow != new_flow) { tty->ctrl_status &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP); if (new_flow) tty->ctrl_status |= TIOCPKT_DOSTOP; else tty->ctrl_status |= TIOCPKT_NOSTOP; } if (extproc) tty->ctrl_status |= TIOCPKT_IOCTL; spin_unlock_irq(&tty->ctrl_lock); wake_up_interruptible(&tty->link->read_wait); } } tty->termios.c_cflag &= ~(CSIZE | PARENB); tty->termios.c_cflag |= (CS8 | CREAD); } /** * pty_do_resize - resize event * @tty: tty being resized * @ws: window size being set. * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ static int pty_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp, *rpgrp; struct tty_struct *pty = tty->link; /* For a PTY we need to lock the tty side */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group of both ptys */ pgrp = tty_get_pgrp(tty); rpgrp = tty_get_pgrp(pty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); if (rpgrp != pgrp && rpgrp) kill_pgrp(rpgrp, SIGWINCH, 1); put_pid(pgrp); put_pid(rpgrp); tty->winsize = *ws; pty->winsize = *ws; /* Never used so will go away soon */ done: mutex_unlock(&tty->winsize_mutex); return 0; } /** * pty_start - start() handler * pty_stop - stop() handler * @tty: tty being flow-controlled * * Propagates the TIOCPKT status to the master pty. * * NB: only the master pty can be in packet mode so only the slave * needs start()/stop() handlers */ static void pty_start(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); tty->ctrl_status &= ~TIOCPKT_STOP; tty->ctrl_status |= TIOCPKT_START; spin_unlock_irqrestore(&tty->ctrl_lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } static void pty_stop(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); tty->ctrl_status &= ~TIOCPKT_START; tty->ctrl_status |= TIOCPKT_STOP; spin_unlock_irqrestore(&tty->ctrl_lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } /** * pty_common_install - set up the pty pair * @driver: the pty driver * @tty: the tty being instantiated * @legacy: true if this is BSD style * * Perform the initial set up for the tty/pty pair. Called from the * tty layer when the port is first opened. * * Locking: the caller must hold the tty_mutex */ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, bool legacy) { struct tty_struct *o_tty; struct tty_port *ports[2]; int idx = tty->index; int retval = -ENOMEM; /* Opening the slave first has always returned -EIO */ if (driver->subtype != PTY_TYPE_MASTER) return -EIO; ports[0] = kmalloc(sizeof **ports, GFP_KERNEL); ports[1] = kmalloc(sizeof **ports, GFP_KERNEL); if (!ports[0] || !ports[1]) goto err; if (!try_module_get(driver->other->owner)) { /* This cannot in fact currently happen */ goto err; } o_tty = alloc_tty_struct(driver->other, idx); if (!o_tty) goto err_put_module; tty_set_lock_subclass(o_tty); lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this the easy way .. */ tty_init_termios(tty); tty_init_termios(o_tty); driver->other->ttys[idx] = o_tty; driver->ttys[idx] = tty; } else { memset(&tty->termios_locked, 0, sizeof(tty->termios_locked)); tty->termios = driver->init_termios; memset(&o_tty->termios_locked, 0, sizeof(tty->termios_locked)); o_tty->termios = driver->other->init_termios; } /* * Everything allocated ... set up the o_tty structure. */ tty_driver_kref_get(driver->other); /* Establish the links in both directions */ tty->link = o_tty; o_tty->link = tty; tty_port_init(ports[0]); tty_port_init(ports[1]); tty_buffer_set_limit(ports[0], 8192); tty_buffer_set_limit(ports[1], 8192); o_tty->port = ports[0]; tty->port = ports[1]; o_tty->port->itty = o_tty; tty_buffer_set_lock_subclass(o_tty->port); tty_driver_kref_get(driver); tty->count++; o_tty->count++; return 0; err_put_module: module_put(driver->other->owner); err: kfree(ports[0]); kfree(ports[1]); return retval; } static void pty_cleanup(struct tty_struct *tty) { tty_port_put(tty->port); } /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS static int pty_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, true); } static void pty_remove(struct tty_driver *driver, struct tty_struct *tty) { struct tty_struct *pair = tty->link; driver->ttys[tty->index] = NULL; if (pair) pair->driver->ttys[pair->index] = NULL; } static int pty_bsd_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *) arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); case TIOCGPTN: /* TTY returns ENOTTY, but glibc expects EINVAL here */ return -EINVAL; } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_bsd_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_bsd_ioctl(tty, cmd, (unsigned long)compat_ptr(arg)); } #else #define pty_bsd_compat_ioctl NULL #endif static int legacy_count = CONFIG_LEGACY_PTY_COUNT; /* * not really modular, but the easiest way to keep compat with existing * bootargs behaviour is to continue using module_param here. */ module_param(legacy_count, int, 0); /* * The master side of a pty can do TIOCSPTLCK and thus * has pty_bsd_ioctl. */ static const struct tty_operations master_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_bsd_ioctl, .compat_ioctl = pty_bsd_compat_ioctl, .cleanup = pty_cleanup, .resize = pty_resize, .remove = pty_remove }; static const struct tty_operations slave_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .cleanup = pty_cleanup, .resize = pty_resize, .start = pty_start, .stop = pty_stop, .remove = pty_remove }; static void __init legacy_pty_init(void) { struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); pty_driver->driver_name = "pty_master"; pty_driver->name = "pty"; pty_driver->major = PTY_MASTER_MAJOR; pty_driver->minor_start = 0; pty_driver->type = TTY_DRIVER_TYPE_PTY; pty_driver->subtype = PTY_TYPE_MASTER; pty_driver->init_termios = tty_std_termios; pty_driver->init_termios.c_iflag = 0; pty_driver->init_termios.c_oflag = 0; pty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_driver->init_termios.c_lflag = 0; pty_driver->init_termios.c_ispeed = 38400; pty_driver->init_termios.c_ospeed = 38400; pty_driver->other = pty_slave_driver; tty_set_operations(pty_driver, &master_pty_ops_bsd); pty_slave_driver->driver_name = "pty_slave"; pty_slave_driver->name = "ttyp"; pty_slave_driver->major = PTY_SLAVE_MAJOR; pty_slave_driver->minor_start = 0; pty_slave_driver->type = TTY_DRIVER_TYPE_PTY; pty_slave_driver->subtype = PTY_TYPE_SLAVE; pty_slave_driver->init_termios = tty_std_termios; pty_slave_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_slave_driver->init_termios.c_ispeed = 38400; pty_slave_driver->init_termios.c_ospeed = 38400; pty_slave_driver->other = pty_driver; tty_set_operations(pty_slave_driver, &slave_pty_ops_bsd); if (tty_register_driver(pty_driver)) panic("Couldn't register pty driver"); if (tty_register_driver(pty_slave_driver)) panic("Couldn't register pty slave driver"); } #else static inline void legacy_pty_init(void) { } #endif /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node * @tty: the master of the pty being opened * @flags: the flags for open * * Provide a race free way for userspace to open the slave end of a pty * (where they have the master fd and cannot access or trust the mount * namespace /dev/pts was mounted inside). */ int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd = -1; struct file *filp; int retval = -EINVAL; struct path path; if (tty->driver != ptm_driver) return -EIO; fd = get_unused_fd_flags(flags); if (fd < 0) { retval = fd; goto err; } /* Compute the slave's path */ path.mnt = devpts_mntget(master, tty->driver_data); if (IS_ERR(path.mnt)) { retval = PTR_ERR(path.mnt); goto err_put; } path.dentry = tty->link->driver_data; filp = dentry_open(&path, flags, current_cred()); mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; } fd_install(fd, filp); return fd; err_put: put_unused_fd(fd); err: return retval; } static int pty_unix98_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *)arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_unix98_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_unix98_ioctl(tty, cmd, cmd == TIOCSIG ? arg : (unsigned long)compat_ptr(arg)); } #else #define pty_unix98_compat_ioctl NULL #endif /** * ptm_unix98_lookup - find a pty master * @driver: ptm driver * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking. */ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { /* Master must be open via /dev/ptmx */ return ERR_PTR(-EIO); } /** * pts_unix98_lookup - find a pty slave * @driver: pts driver * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking for the tty pointer. */ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; mutex_lock(&devpts_mutex); tty = devpts_get_priv(file->f_path.dentry); mutex_unlock(&devpts_mutex); /* Master must be open before slave */ if (!tty) return ERR_PTR(-EIO); return tty; } static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, false); } /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) fsi = tty->driver_data; else fsi = tty->link->driver_data; if (fsi) { devpts_kill_index(fsi, tty->index); devpts_release(fsi); } } static void pty_show_fdinfo(struct tty_struct *tty, struct seq_file *m) { seq_printf(m, "tty-index:\t%d\n", tty->index); } static const struct tty_operations ptm_unix98_ops = { .lookup = ptm_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_unix98_ioctl, .compat_ioctl = pty_unix98_compat_ioctl, .resize = pty_resize, .cleanup = pty_cleanup, .show_fdinfo = pty_show_fdinfo, }; static const struct tty_operations pty_unix98_ops = { .lookup = pts_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .start = pty_start, .stop = pty_stop, .cleanup = pty_cleanup, }; /** * ptmx_open - open a unix 98 pty master * @inode: inode of device file * @filp: file pointer to tty * * Allocate a unix98 pty master device from the ptmx driver. * * Locking: tty_mutex protects the init_dev work. tty->count should * protect the rest. * allocated_ptys_lock handles the list of free pty numbers */ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; struct dentry *dentry; int retval; int index; nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ filp->f_mode |= FMODE_NONOTIFY; retval = tty_alloc_file(filp); if (retval) return retval; fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; } /* find a device that is not in use. */ mutex_lock(&devpts_mutex); index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); retval = index; if (index < 0) goto out_put_fsi; mutex_lock(&tty_mutex); tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); retval = PTR_ERR(tty); if (IS_ERR(tty)) goto out; /* * From here on out, the tty is "live", and the index and * fsi will be killed/put by the tty_release() */ set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ tty->driver_data = fsi; tty_add_file(tty, filp); dentry = devpts_pty_new(fsi, index, tty->link); if (IS_ERR(dentry)) { retval = PTR_ERR(dentry); goto err_release; } tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; err_release: tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); return retval; out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; } static struct file_operations ptmx_fops __ro_after_init; static void __init unix98_pty_init(void) { ptm_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); ptm_driver->driver_name = "pty_master"; ptm_driver->name = "ptm"; ptm_driver->major = UNIX98_PTY_MASTER_MAJOR; ptm_driver->minor_start = 0; ptm_driver->type = TTY_DRIVER_TYPE_PTY; ptm_driver->subtype = PTY_TYPE_MASTER; ptm_driver->init_termios = tty_std_termios; ptm_driver->init_termios.c_iflag = 0; ptm_driver->init_termios.c_oflag = 0; ptm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; ptm_driver->init_termios.c_lflag = 0; ptm_driver->init_termios.c_ispeed = 38400; ptm_driver->init_termios.c_ospeed = 38400; ptm_driver->other = pts_driver; tty_set_operations(ptm_driver, &ptm_unix98_ops); pts_driver->driver_name = "pty_slave"; pts_driver->name = "pts"; pts_driver->major = UNIX98_PTY_SLAVE_MAJOR; pts_driver->minor_start = 0; pts_driver->type = TTY_DRIVER_TYPE_PTY; pts_driver->subtype = PTY_TYPE_SLAVE; pts_driver->init_termios = tty_std_termios; pts_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pts_driver->init_termios.c_ispeed = 38400; pts_driver->init_termios.c_ospeed = 38400; pts_driver->other = ptm_driver; tty_set_operations(pts_driver, &pty_unix98_ops); if (tty_register_driver(ptm_driver)) panic("Couldn't register Unix98 ptm driver"); if (tty_register_driver(pts_driver)) panic("Couldn't register Unix98 pts driver"); /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; cdev_init(&ptmx_cdev, &ptmx_fops); if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver"); device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); } #else static inline void unix98_pty_init(void) { } #endif static int __init pty_init(void) { legacy_pty_init(); unix98_pty_init(); return 0; } device_initcall(pty_init); |
3 3 2 2 2 9 9 1 4 2 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <net/netlink.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ /* 2 Comment support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac #define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { MAC_UNSET, /* element is set, without MAC */ MAC_FILLED, /* element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { unsigned char ether[ETH_ALEN] __aligned(2); u16 id; u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; } __aligned(__alignof__(u64)); static inline u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; } #define get_elem(extensions, id, dsize) \ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) #define get_const_elem(extensions, id, dsize) \ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ static inline int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(e->id, map->members)) return 0; elem = get_const_elem(map->extensions, e->id, dsize); if (e->add_mac && elem->filled == MAC_FILLED) return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } static inline int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(id, map->members)) return 0; elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } static inline int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } static inline int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, struct bitmap_ipmac *map, int mode) { u32 t = ext->timeout; if (mode == IPSET_ADD_START_STORED_TIMEOUT) { if (t == set->timeout) /* Timeout was not specified, get stored one */ t = *timeout; ip_set_timeout_set(timeout, t); } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; } return 0; } static inline int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { if (e->add_mac && (flags & IPSET_FLAG_EXIST) && !ether_addr_equal(e->ether, elem->ether)) { /* memcpy isn't atomic */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; } else if (e->add_mac) { /* We can store MAC too */ ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; } elem->filled = MAC_UNSET; /* MAC is not stored yet, don't start timer */ return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static inline int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } static inline int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || (elem->filled == MAC_FILLED && nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } static inline int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; e.id = ip_to_id(map, ip); if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) { if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN) return -IPSET_ERR_PROTOCOL; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); e.add_mac = 1; } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { const struct bitmap_ipmac *x = a->data; const struct bitmap_ipmac *y = b->data; return x->first_ip == y->first_ip && x->last_ip == y->last_ip && a->timeout == b->timeout && a->extensions == b->extensions; } /* Plain variant */ #include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; map->set = set; set->data = map; set->family = NFPROTO_IPV4; return true; } static int bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 first_ip = 0, last_ip = 0; u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); if (ret) return ret; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); if (ret) return ret; if (first_ip > last_ip) swap(first_ip, last_ip); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); } else { return -IPSET_ERR_PROTOCOL; } elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; set->dsize = ip_set_elem_len(set, tb, sizeof(struct bitmap_ipmac_elem), __alignof__(struct bitmap_ipmac_elem)); map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { kfree(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } return 0; } static struct ip_set_type bitmap_ipmac_type = { .name = "bitmap:ip,mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, .family = NFPROTO_IPV4, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_ipmac_create, .create_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init bitmap_ipmac_init(void) { return ip_set_type_register(&bitmap_ipmac_type); } static void __exit bitmap_ipmac_fini(void) { rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } module_init(bitmap_ipmac_init); module_exit(bitmap_ipmac_fini); |
1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 | /* * linux/fs/nls/mac-cyrillic.c * * Charset maccyrillic translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* 0x90 */ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* 0xa0 */ 0x2020, 0x00b0, 0x0490, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x0406, 0x00ae, 0x00a9, 0x2122, 0x0402, 0x0452, 0x2260, 0x0403, 0x0453, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x0456, 0x00b5, 0x0491, 0x0408, 0x0404, 0x0454, 0x0407, 0x0457, 0x0409, 0x0459, 0x040a, 0x045a, /* 0xc0 */ 0x0458, 0x0405, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x040b, 0x045b, 0x040c, 0x045c, 0x0455, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x201e, 0x040e, 0x045e, 0x040f, 0x045f, 0x2116, 0x0401, 0x0451, 0x044f, /* 0xe0 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* 0xf0 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x20ac, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, 0xa4, /* 0xa0-0xa7 */ 0x00, 0xa9, 0x00, 0xc7, 0xc2, 0x00, 0xa8, 0x00, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0x00, 0xb5, 0xa6, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd6, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page04[256] = { 0x00, 0xdd, 0xab, 0xae, 0xb8, 0xc1, 0xa7, 0xba, /* 0x00-0x07 */ 0xb7, 0xbc, 0xbe, 0xcb, 0xcd, 0x00, 0xd8, 0xda, /* 0x08-0x0f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x10-0x17 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x18-0x1f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x20-0x27 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x28-0x2f */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x30-0x37 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x38-0x3f */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x40-0x47 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0x48-0x4f */ 0x00, 0xde, 0xac, 0xaf, 0xb9, 0xcf, 0xb4, 0xbb, /* 0x50-0x57 */ 0xc0, 0xbd, 0xbf, 0xcc, 0xce, 0x00, 0xd9, 0xdb, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xa2, 0xb6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0xd7, 0x00, /* 0x18-0x1f */ 0xa0, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "maccyrillic", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_maccyrillic(void) { return register_nls(&table); } static void __exit exit_nls_maccyrillic(void) { unregister_nls(&table); } module_init(init_nls_maccyrillic) module_exit(exit_nls_maccyrillic) MODULE_LICENSE("Dual BSD/GPL"); |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2 and no later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. */ #ifndef _VMCI_QUEUE_PAIR_H_ #define _VMCI_QUEUE_PAIR_H_ #include <linux/vmw_vmci_defs.h> #include <linux/types.h> #include "vmci_context.h" /* Callback needed for correctly waiting on events. */ typedef int (*vmci_event_release_cb) (void *client_data); /* Guest device port I/O. */ struct ppn_set { u64 num_produce_pages; u64 num_consume_pages; u32 *produce_ppns; u32 *consume_ppns; bool initialized; }; /* VMCIqueue_pairAllocInfo */ struct vmci_qp_alloc_info { struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u64 ppn_va; /* Start VA of queue pair PPNs. */ u64 num_ppns; s32 result; u32 version; }; /* VMCIqueue_pairSetVAInfo */ struct vmci_qp_set_va_info { struct vmci_handle handle; u64 va; /* Start VA of queue pair PPNs. */ u64 num_ppns; u32 version; s32 result; }; /* * For backwards compatibility, here is a version of the * VMCIqueue_pairPageFileInfo before host support end-points was added. * Note that the current version of that structure requires VMX to * pass down the VA of the mapped file. Before host support was added * there was nothing of the sort. So, when the driver sees the ioctl * with a parameter that is the sizeof * VMCIqueue_pairPageFileInfo_NoHostQP then it can infer that the version * of VMX running can't attach to host end points because it doesn't * provide the VA of the mapped files. * * The Linux driver doesn't get an indication of the size of the * structure passed down from user space. So, to fix a long standing * but unfiled bug, the _pad field has been renamed to version. * Existing versions of VMX always initialize the PageFileInfo * structure so that _pad, er, version is set to 0. * * A version value of 1 indicates that the size of the structure has * been increased to include two UVA's: produce_uva and consume_uva. * These UVA's are of the mmap()'d queue contents backing files. * * In addition, if when VMX is sending down the * VMCIqueue_pairPageFileInfo structure it gets an error then it will * try again with the _NoHostQP version of the file to see if an older * VMCI kernel module is running. */ /* VMCIqueue_pairPageFileInfo */ struct vmci_qp_page_file_info { struct vmci_handle handle; u64 produce_page_file; /* User VA. */ u64 consume_page_file; /* User VA. */ u64 produce_page_file_size; /* Size of the file name array. */ u64 consume_page_file_size; /* Size of the file name array. */ s32 result; u32 version; /* Was _pad. */ u64 produce_va; /* User VA of the mapped file. */ u64 consume_va; /* User VA of the mapped file. */ }; /* vmci queuepair detach info */ struct vmci_qp_dtch_info { struct vmci_handle handle; s32 result; u32 _pad; }; /* * struct vmci_qp_page_store describes how the memory of a given queue pair * is backed. When the queue pair is between the host and a guest, the * page store consists of references to the guest pages. On vmkernel, * this is a list of PPNs, and on hosted, it is a user VA where the * queue pair is mapped into the VMX address space. */ struct vmci_qp_page_store { /* Reference to pages backing the queue pair. */ u64 pages; /* Length of pageList/virtual addres range (in pages). */ u32 len; }; /* * This data type contains the information about a queue. * There are two queues (hence, queue pairs) per transaction model between a * pair of end points, A & B. One queue is used by end point A to transmit * commands and responses to B. The other queue is used by B to transmit * commands and responses. * * struct vmci_queue_kern_if is a per-OS defined Queue structure. It contains * either a direct pointer to the linear address of the buffer contents or a * pointer to structures which help the OS locate those data pages. See * vmciKernelIf.c for each platform for its definition. */ struct vmci_queue { struct vmci_queue_header *q_header; struct vmci_queue_header *saved_header; struct vmci_queue_kern_if *kernel_if; }; /* * Utility function that checks whether the fields of the page * store contain valid values. * Result: * true if the page store is wellformed. false otherwise. */ static inline bool VMCI_QP_PAGESTORE_IS_WELLFORMED(struct vmci_qp_page_store *page_store) { return page_store->len >= 2; } void vmci_qp_broker_exit(void); int vmci_qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context); int vmci_qp_broker_set_page_store(struct vmci_handle handle, u64 produce_uva, u64 consume_uva, struct vmci_ctx *context); int vmci_qp_broker_detach(struct vmci_handle handle, struct vmci_ctx *context); void vmci_qp_guest_endpoints_exit(void); int vmci_qp_alloc(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, bool guest_endpoint, vmci_event_release_cb wakeup_cb, void *client_data); int vmci_qp_broker_map(struct vmci_handle handle, struct vmci_ctx *context, u64 guest_mem); int vmci_qp_broker_unmap(struct vmci_handle handle, struct vmci_ctx *context, u32 gid); #endif /* _VMCI_QUEUE_PAIR_H_ */ |
2 2 4 4 4 4 2 455 454 30 454 454 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 | /* * IPVS: Locality-Based Least-Connection with Replication scheduler * * Authors: Wensong Zhang <wensong@gnuchina.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Changes: * Julian Anastasov : Added the missing (dest->weight>0) * condition in the ip_vs_dest_set_max. * */ /* * The lblc/r algorithm is as follows (pseudo code): * * if serverSet[dest_ip] is null then * n, serverSet[dest_ip] <- {weighted least-conn node}; * else * n <- {least-conn (alive) node in serverSet[dest_ip]}; * if (n is null) OR * (n.conns>n.weight AND * there is a node m with m.conns<m.weight/2) then * n <- {weighted least-conn node}; * add n to serverSet[dest_ip]; * if |serverSet[dest_ip]| > 1 AND * now - serverSet[dest_ip].lastMod > T then * m <- {most conn node in serverSet[dest_ip]}; * remove m from serverSet[dest_ip]; * if serverSet[dest_ip] changed then * serverSet[dest_ip].lastMod <- now; * * return n; * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> #include <linux/sysctl.h> #include <net/net_namespace.h> #include <net/ip_vs.h> /* * It is for garbage collection of stale IPVS lblcr entries, * when the table is full. */ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) #define DEFAULT_EXPIRATION (24*60*60*HZ) /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) * in a half hour, do a full expiration check to collect stale * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 /* * for IPVS lblcr entry hash table */ #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 #endif #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) /* * IPVS destination set structure and operations */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ struct ip_vs_dest *dest; /* destination server */ struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ }; static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; if (check) { list_for_each_entry(e, &set->list, list) { if (e->dest == dest) return; } } e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) return; ip_vs_dest_hold(dest); e->dest = dest; list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; } static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) { struct ip_vs_dest_set_elem *e; e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); ip_vs_dest_put_and_free(e->dest); kfree(e); } static void ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) { struct ip_vs_dest_set_elem *e; list_for_each_entry(e, &set->list, list) { if (e->dest == dest) { /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; list_del_rcu(&e->list); call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); break; } } } static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; list_for_each_entry_safe(e, ep, &set->list, list) { list_del_rcu(&e->list); call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); } } /* get weighted least-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) { register struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *least; int loh, doh; /* select the first destination server, whose weight > 0 */ list_for_each_entry_rcu(e, &set->list, list) { least = e->dest; if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; if ((atomic_read(&least->weight) > 0) && (least->flags & IP_VS_DEST_F_AVAILABLE)) { loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* find the destination with the weighted least load */ nextstage: list_for_each_entry_continue_rcu(e, &set->list, list) { dest = e->dest; if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if (((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", __func__, IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* get weighted most-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) { register struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *most; int moh, doh; if (set == NULL) return NULL; /* select the first destination server, whose weight > 0 */ list_for_each_entry(e, &set->list, list) { most = e->dest; if (atomic_read(&most->weight) > 0) { moh = ip_vs_dest_conn_overhead(most); goto nextstage; } } return NULL; /* find the destination with the weighted most load */ nextstage: list_for_each_entry_continue(e, &set->list, list) { dest = e->dest; doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ if (((__s64)moh * atomic_read(&dest->weight) < (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; } } IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", __func__, IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), atomic_read(&most->activeconns), refcount_read(&most->refcnt), atomic_read(&most->weight), moh); return most; } /* * IPVS lblcr entry represents an association between destination * IP address and its destination server set */ struct ip_vs_lblcr_entry { struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ struct rcu_head rcu_head; }; /* * IPVS lblcr hash table */ struct ip_vs_lblcr_table { struct rcu_head rcu_head; struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ struct ip_vs_service *svc; /* pointer back to service */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ bool dead; }; #ifdef CONFIG_SYSCTL /* * IPVS LBLCR sysctl table */ static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; #endif static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); kfree_rcu(en, rcu_head); } /* * Returns hash value for IPVS LBLCR entry */ static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } /* * Hash an entry in the ip_vs_lblcr_table. * returns bool success. */ static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } /* Get ip_vs_lblcr_entry associated with supplied parameters. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; return NULL; } /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblcr_entry *en; en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; en->af = af; ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); return en; } ip_vs_dest_set_insert(&en->set, dest, true); return en; } /* * Flush all the entries of the specified table. */ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; struct ip_vs_lblcr_entry *en; struct hlist_node *next; spin_lock_bh(&svc->sched_lock); tbl->dead = true; for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL return svc->ipvs->sysctl_lblcr_expiration; #else return DEFAULT_EXPIRATION; #endif } static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en; struct hlist_node *next; for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } spin_unlock(&svc->sched_lock); } tbl->rover = j; } /* * Periodical timer handler for IPVS lblcr table * It is used to collect stale entries when the number of entries * exceeds the maximum size of the table. * * Fixme: we probably need more complicated algorithm to collect * entries that have not been used for a long time even * if the number of entries doesn't exceed the maximum size * of the table. * The full expiration check is for this purpose now. */ static void ip_vs_lblcr_check_expire(struct timer_list *t) { struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblcr_entry *en; struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ ip_vs_lblcr_full_check(svc); tbl->counter = 1; goto out; } if (atomic_read(&tbl->entries) <= tbl->max_size) { tbl->counter++; goto out; } goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; if (goal > tbl->max_size/2) goal = tbl->max_size/2; for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); goal--; } spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); } static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) { int i; struct ip_vs_lblcr_table *tbl; /* * Allocate the ip_vs_lblcr_table for this service */ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets */ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; tbl->dead = false; tbl->svc = svc; atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection */ timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; /* remove periodic timer */ del_timer_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblcr_flush(svc); /* release the table itself */ kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } static inline struct ip_vs_dest * __ip_vs_lblcr_schedule(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *least; int loh, doh; /* * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connection. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* * Find the destination with the least load. */ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* * If this destination server is overloaded and there is a less loaded * server, then return true. */ static inline int is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) { if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; } } } return 0; } /* * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_lblcr_table *tbl = svc->sched_data; struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); if (en) { en->lastuse = jiffies; /* Get the least loaded destination */ dest = ip_vs_dest_set_min(&en->set); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { spin_lock_bh(&svc->sched_lock); if (atomic_read(&en->set.size) > 1) { struct ip_vs_dest *m; m = ip_vs_dest_set_max(&en->set); if (m) ip_vs_dest_set_erase(&en->set, m); } spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ if (dest && !is_overloaded(dest, svc)) goto out; /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* Update our cache entry */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_dest_set_insert(&en->set, dest, true); spin_unlock_bh(&svc->sched_lock); goto out; } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { IP_VS_DBG(1, "no destination available\n"); return NULL; } /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS LBLCR Scheduler structure */ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = { .name = "lblcr", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, .schedule = ip_vs_lblcr_schedule, }; /* * per netns init. */ #ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); if (!ipvs) return -ENOENT; if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblcr_ctl_table == NULL) return -ENOMEM; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) ipvs->lblcr_ctl_table[0].procname = NULL; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; ipvs->lblcr_ctl_header = register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); return -ENOMEM; } return 0; } static void __net_exit __ip_vs_lblcr_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); unregister_net_sysctl_table(ipvs->lblcr_ctl_header); if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); } #else static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } #endif static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, }; static int __init ip_vs_lblcr_init(void) { int ret; ret = register_pernet_subsys(&ip_vs_lblcr_ops); if (ret) return ret; ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) unregister_pernet_subsys(&ip_vs_lblcr_ops); return ret; } static void __exit ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); rcu_barrier(); } module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); |
9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | /* RomFS internal definitions * * Copyright © 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include <linux/romfs_fs.h> struct romfs_inode_info { struct inode vfs_inode; unsigned long i_metasize; /* size of non-data area */ unsigned long i_dataoffset; /* from the start of fs */ }; static inline size_t romfs_maxsize(struct super_block *sb) { return (size_t) (unsigned long) sb->s_fs_info; } static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) { return container_of(inode, struct romfs_inode_info, vfs_inode); } /* * mmap-nommu.c */ #if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) extern const struct file_operations romfs_ro_fops; #else #define romfs_ro_fops generic_ro_fops #endif /* * storage.c */ extern int romfs_dev_read(struct super_block *sb, unsigned long pos, void *buf, size_t buflen); extern ssize_t romfs_dev_strnlen(struct super_block *sb, unsigned long pos, size_t maxlen); extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, const char *str, size_t size); |
29 4837 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 | /* * linux/kernel/time/tick-sched.c * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar * * Distribute under GPLv2. */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* * The time, when the last jiffy update happened. Protected by jiffies_lock. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 0; ktime_t delta; /* * Do a quick check without holding jiffies_lock: * The READ_ONCE() pairs with two updates done later in this function. */ delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; /* Reevaluate with jiffies_lock held */ write_seqlock(&jiffies_lock); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); /* Pairs with the lockless read in this function. */ WRITE_ONCE(last_jiffies_update, ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { s64 incr = ktime_to_ns(tick_period); ticks = ktime_divns(delta, incr); /* Pairs with the lockless read in this function. */ WRITE_ONCE(last_jiffies_update, ktime_add_ns(last_jiffies_update, incr * ticks)); } do_timer(++ticks); /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { write_sequnlock(&jiffies_lock); return; } write_sequnlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; write_seqlock(&jiffies_lock); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; write_sequnlock(&jiffies_lock); return period; } static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) && !tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; #endif /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); if (ts->inidle) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { #ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on complete idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } #endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; bool tick_nohz_full_running; static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(¤t->tick_dep_mask)) return false; if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, }; /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * by unstable clock. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage events throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote irq work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } /* * Set a per-task tick dependency. Posix CPU timers need this in order to elapse * per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { /* * We could optimize this with just kicking the target running the task * if that noise matters for nohz full users. */ tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); } void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) { tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { unsigned long flags; struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } out: local_irq_restore(flags); } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } static int tick_nohz_cpu_down(unsigned int cpu) { /* * The boot CPU handles housekeeping duty (unbound timers, * workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return -EBUSY; return 0; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return ts->tick_stopped; } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } /* * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } if (last_update_time) *last_update_time = ktime_to_us(now); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); idle = ts->idle_sleeptime; } else { if (ts->idle_active && !nr_iowait_cpu(cpu)) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(ts->idle_sleeptime, delta); } else { idle = ts->idle_sleeptime; } } return ktime_to_us(idle); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); iowait = ts->iowait_sleeptime; } else { if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); iowait = ktime_add(ts->iowait_sleeptime, delta); } else { iowait = ts->iowait_sleeptime; } } return ktime_to_us(iowait); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, tick_period); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); /* * Reset to make sure next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request * immeditate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tmr = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tmr; /* Take the next rcu event into account */ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): */ timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); if (cpu != tick_do_timer_cpu && (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { ts->do_timer_last = 0; } /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { calc_load_nohz_start(); cpu_load_update_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = tick; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); return; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); cpu_load_update_nohz_stop(); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; ts->idle_exittime = now; tick_nohz_restart(ts, now); } static void tick_nohz_full_update_tick(struct tick_sched *ts) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (!tick_nohz_full_cpu(cpu)) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; /* * Make sure the CPU doesn't get fooled by obsolete tick * deadline if it comes back online later. */ ts->next_tick = 0; return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; if (need_resched()) return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { static int ratelimit; if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { pr_warn("NOHZ: local_softirq_pending %02x\n", (unsigned int) local_softirq_pending()); ratelimit++; } return false; } if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_do_timer_cpu == cpu) return false; /* * Boot safety: make sure the timekeeping duty has been * assigned before entering dyntick-idle mode, */ if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) return false; } return true; } static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { ktime_t expires; int cpu = smp_processor_id(); /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = ts->tick_stopped; tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); /* * Undo the effect of get_next_timer_interrupt() called from * tick_nohz_next_event(). */ timer_clear_idle(); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 1; tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - update next tick event from interrupt exit * * When an interrupt fires while we are idle and it doesn't cause * a reschedule, it may still add, modify or delete a timer, enqueue * an RCU callback, etc... * So we need to re-calculate and reprogram the next tick event. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!ts->inidle); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->idle_calls; } static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; if (vtime_accounting_cpu_enabled()) return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick * accounting. Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); #endif } static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) { tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) __tick_nohz_idle_restart_tick(ts, ktime_get()); } /** * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle * This also exit the RCU extended quiescent state. The CPU * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; idle_active = ts->idle_active; tick_stopped = ts->tick_stopped; if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } /* * The nohz low res interrupt handler */ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ if (unlikely(ts->tick_stopped)) return; hrtimer_forward(&ts->sched_timer, now, tick_period); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { if (!tick_nohz_enabled) return; ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to nohz mode */ static void tick_nohz_switch_to_nohz(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_handler)) return; /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* Get the next period */ next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); hrtimer_forward_now(&ts->sched_timer, tick_period); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } /* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { u64 offset = ktime_to_ns(tick_period) >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif memset(ts, 0, sizeof(*ts)); } #endif /** * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /** * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; } |
9 6 6 41 43 14 70 15 29 36 50 6 5 7 7 4 4 1 3 2 1 1 3 1 13 109 42 102 89 75 69 56 40 34 33 51 51 56 47 43 41 41 41 37 51 2 8 2 2 2 2 2 1 2 2 25 21 22 2 2 1 1 1 23 8 5 3 2 2 2 5 7 5 4 2 2 1 3 6 5 6 21 21 37 30 27 5 26 32 16 18 16 36 33 7 7 21 7 4 4 3 6 118 117 115 115 113 111 117 115 2 1 3 3 92 90 3 1 91 89 2 89 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 3 2 1 1 1 10 10 1 9 10 5 2 2 6 1 1 3 3 4 3 3 19 2 19 3 19 13 19 3 1 19 3 16 30 6 29 27 19 1 19 18 19 18 1 1 1 1 19 19 13 19 12 19 13 10 7 5 3 4 1 4 3 13 12 17 17 1 10 9 2 2 7 9 7 4 1 4 19 49 39 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 | /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/pagemap.h> #include <linux/file.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_inode *fi = get_fuse_inode(dir); if (!fc->do_readdirplus) return false; if (!fc->readdirplus_auto) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) return true; if (ctx->pos == 0) return true; return false; } static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } union fuse_dentry { u64 time; struct rcu_head rcu; }; static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { ((union fuse_dentry *) entry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ static u64 time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { sec, min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } /* * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ static void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } static u64 attr_timeout(struct fuse_attr_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } static u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { get_fuse_inode(inode)->i_time = 0; } /** * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. */ void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) fuse_invalidate_attr(inode); } /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace * * This is called when a dentry is about to become negative and the * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } /* * Same as fuse_invalidate_entry_cache(), but also try to remove the * dentry from the hash */ static void fuse_invalidate_entry(struct dentry *entry) { d_invalidate(entry); fuse_invalidate_entry_cache(entry); } static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); args->in.h.opcode = FUSE_LOOKUP; args->in.h.nodeid = nodeid; args->in.numargs = 1; args->in.args[0].size = name->len + 1; args->in.args[0].value = name->name; args->out.numargs = 1; args->out.args[0].size = sizeof(struct fuse_entry_out); args->out.args[0].value = outarg; } u64 fuse_get_attr_version(struct fuse_conn *fc) { u64 curr_version; /* * The spin lock isn't actually needed on 64bit archs, but we * don't yet care too much about such optimizations. */ spin_lock(&fc->lock); curr_version = fc->attr_version; spin_unlock(&fc->lock); return curr_version; } /* * Check whether the dentry is still valid * * If the entry validity timeout has expired and the dentry is * positive, try to redo the lookup. If the lookup results in a * different inode, then let the VFS invalidate the dentry and redo * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; struct fuse_conn *fc; struct fuse_inode *fi; int ret; inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & LOOKUP_REVAL)) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; /* For negative dentries, always do a fresh lookup */ if (!inode) goto invalid; ret = -ECHILD; if (flags & LOOKUP_RCU) goto out; fc = get_fuse_conn(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); ret = fuse_simple_request(fc, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); goto invalid; } spin_lock(&fc->lock); fi->nlookup++; spin_unlock(&fc->lock); } kfree(forget); if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { fi = get_fuse_inode(inode); if (flags & LOOKUP_RCU) { if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); fuse_advise_use_readdirplus(d_inode(parent)); dput(parent); } } ret = 1; out: return ret; invalid: ret = 0; goto out; } static int invalid_nodeid(u64 nodeid) { return !nodeid || nodeid == FUSE_ROOT_ID; } static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); return dentry->d_fsdata ? 0 : -ENOMEM; } static void fuse_dentry_release(struct dentry *dentry) { union fuse_dentry *fd = dentry->d_fsdata; kfree_rcu(fd, rcu); } const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, }; const struct dentry_operations fuse_root_dentry_operations = { .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, }; int fuse_valid_type(int m) { return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } bool fuse_invalid_attr(struct fuse_attr *attr) { return !fuse_valid_type(attr->mode) || attr->size > LLONG_MAX; } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_conn *fc = get_fuse_conn_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; int err; *inode = NULL; err = -ENAMETOOLONG; if (name->len > FUSE_NAME_MAX) goto out; forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fc); fuse_lookup_init(fc, &args, nodeid, name, outarg); err = fuse_simple_request(fc, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; err = -EIO; if (!outarg->nodeid) goto out_put_forget; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, entry_attr_timeout(outarg), attr_version); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fc, forget, outarg->nodeid, 1); goto out; } err = 0; out_put_forget: kfree(forget); out: return err; } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { int err; struct fuse_entry_out outarg; struct inode *inode; struct dentry *newent; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); fuse_unlock_inode(dir, locked); if (err == -ENOENT) { outarg_valid = false; err = 0; } if (err) goto out_err; err = -EIO; if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; newent = d_splice_alias(inode, entry); err = PTR_ERR(newent); if (IS_ERR(newent)) goto out_err; entry = newent ? newent : entry; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); fuse_advise_use_readdirplus(dir); return newent; out_iput: iput(inode); out_err: return ERR_PTR(err); } /* * Atomic create+open operation * * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) { int err; struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; struct fuse_file *ff; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out_err; err = -ENOMEM; ff = fuse_file_alloc(fc); if (!ff) goto out_put_forget_req; if (!fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; memset(&inarg, 0, sizeof(inarg)); memset(&outentry, 0, sizeof(outentry)); inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); args.in.h.opcode = FUSE_CREATE; args.in.h.nodeid = get_node_id(dir); args.in.numargs = 2; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; args.in.args[1].size = entry->d_name.len + 1; args.in.args[1].value = entry->d_name.name; args.out.numargs = 2; args.out.args[0].size = sizeof(outentry); args.out.args[0].value = &outentry; args.out.args[1].size = sizeof(outopen); args.out.args[1].value = &outopen; err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; err = -EIO; if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopen.fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(ff, flags); fuse_queue_forget(fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_invalidate_attr(dir); err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); } else { file->private_data = ff; fuse_finish_open(inode, file); } return err; out_free_ff: fuse_file_free(ff); out_put_forget_req: kfree(forget); out_err: return err; } static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) { int err; struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; if (fuse_is_bad(dir)) return -EIO; if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) entry = res; } if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ file->f_mode |= FMODE_CREATED; if (fc->no_create) goto mknod; err = fuse_create_open(dir, entry, file, flags, mode); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; } out_dput: dput(res); return err; mknod: err = fuse_mknod(dir, entry, mode, 0); if (err) goto out_dput; no_open: return finish_no_open(file, res); } /* * Code shared between mknod, mkdir, symlink and link */ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; int err; struct fuse_forget_link *forget; if (fuse_is_bad(dir)) return -EIO; forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); args->in.h.nodeid = get_node_id(dir); args->out.numargs = 1; args->out.args[0].size = sizeof(outarg); args->out.args[0].value = &outarg; err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; err = -EIO; if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) return PTR_ERR(d); if (d) { fuse_change_entry_timeout(d, &outarg); dput(d); } else { fuse_change_entry_timeout(entry, &outarg); } fuse_invalidate_attr(dir); return 0; out_put_forget_req: kfree(forget); return err; } static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); args.in.h.opcode = FUSE_MKNOD; args.in.numargs = 2; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; args.in.args[1].size = entry->d_name.len + 1; args.in.args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { return fuse_mknod(dir, entry, mode, 0); } static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); args.in.h.opcode = FUSE_MKDIR; args.in.numargs = 2; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; args.in.args[1].size = entry->d_name.len + 1; args.in.args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, const char *link) { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); args.in.h.opcode = FUSE_SYMLINK; args.in.numargs = 2; args.in.args[0].size = entry->d_name.len + 1; args.in.args[0].value = entry->d_name.name; args.in.args[1].size = len; args.in.args[1].value = link; return create_new_entry(fc, &args, dir, entry, S_IFLNK); } void fuse_update_ctime(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); } } static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.in.h.opcode = FUSE_UNLINK; args.in.h.nodeid = get_node_id(dir); args.in.numargs = 1; args.in.args[0].size = entry->d_name.len + 1; args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be * difficult to enforce correct nlink usage so just ignore this * condition here */ if (inode->i_nlink > 0) drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; } static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.in.h.opcode = FUSE_RMDIR; args.in.h.nodeid = get_node_id(dir); args.in.numargs = 1; args.in.args[0].size = entry->d_name.len + 1; args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; } static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { int err; struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; args.in.h.opcode = opcode; args.in.h.nodeid = get_node_id(olddir); args.in.numargs = 3; args.in.args[0].size = argsize; args.in.args[0].value = &inarg; args.in.args[1].size = oldent->d_name.len + 1; args.in.args[1].value = oldent->d_name.name; args.in.args[2].size = newent->d_name.len + 1; args.in.args[2].value = newent->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(d_inode(oldent)); fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) { fuse_invalidate_attr(d_inode(newent)); fuse_update_ctime(d_inode(newent)); } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { fuse_invalidate_attr(d_inode(newent)); fuse_invalidate_entry_cache(newent); fuse_update_ctime(d_inode(newent)); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. */ fuse_invalidate_entry(oldent); if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } return err; } static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; if (fuse_is_bad(olddir)) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags) { if (fc->no_rename2 || fc->minor < 23) return -EINVAL; err = fuse_rename_common(olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { fc->no_rename2 = 1; err = -EINVAL; } } else { err = fuse_rename_common(olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } return err; } static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.in.h.opcode = FUSE_LINK; args.in.numargs = 2; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; args.in.args[1].size = newent->d_name.len + 1; args.in.args[1].value = newent->d_name.name; err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. We invalidate the attributes of the old one, so it will reflect changes in the backing inode (link count, etc.) */ if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; if (likely(inode->i_nlink < UINT_MAX)) inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } return err; } static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); /* see the comment in fuse_change_attributes() */ if (fc->writeback_cache && S_ISREG(inode->i_mode)) { attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; attr->ctime = inode->i_ctime.tv_sec; attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; stat->uid = make_kuid(fc->user_ns, attr->uid); stat->gid = make_kgid(fc->user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; stat->mtime.tv_sec = attr->mtime; stat->mtime.tv_nsec = attr->mtimensec; stat->ctime.tv_sec = attr->ctime; stat->ctime.tv_nsec = attr->ctimensec; stat->size = attr->size; stat->blocks = attr->blocks; if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); u64 attr_version; attr_version = fuse_get_attr_version(fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); /* Directories have separate file-handle space */ if (file && S_ISREG(inode->i_mode)) { struct fuse_file *ff = file->private_data; inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } args.in.h.opcode = FUSE_GETATTR; args.in.h.nodeid = get_node_id(inode); args.in.numargs = 1; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; args.out.numargs = 1; args.out.args[0].size = sizeof(outarg); args.out.args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { fuse_make_bad(inode); err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), attr_version); if (stat) fuse_fillattr(inode, &outarg.attr, stat); } } return err; } static int fuse_update_get_attr(struct inode *inode, struct file *file, struct kstat *stat, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; bool sync; if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; else sync = time_before64(fi->i_time, get_jiffies_64()); if (sync) { forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { generic_fillattr(inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } return err; } int fuse_update_attributes(struct inode *inode, struct file *file) { return fuse_update_get_attr(inode, file, NULL, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; struct inode *parent; struct dentry *dir; struct dentry *entry; parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); if (!parent) return -ENOENT; inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; err = -ENOENT; dir = d_find_alias(parent); if (!dir) goto unlock; name->hash = full_name_hash(dir, name->name, name->len); entry = d_lookup(dir, name); dput(dir); if (!entry) goto unlock; fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } if (d_mountpoint(entry)) { err = -EBUSY; goto badentry; } if (d_is_dir(entry)) { shrink_dcache_parent(entry); if (!simple_empty(entry)) { err = -ENOTEMPTY; goto badentry; } d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); clear_nlink(d_inode(entry)); err = 0; badentry: inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { err = 0; } dput(entry); unlock: inode_unlock(parent); iput(parent); return err; } /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example * it can delay the operation for arbitrary length of time allowing * DoS against the requester. * * For this reason only those processes can call into the filesystem, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; if (fc->allow_other) return current_in_userns(fc->user_ns); cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)) return 1; return 0; } static int fuse_access(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); if (fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); args.in.h.opcode = FUSE_ACCESS; args.in.h.nodeid = get_node_id(inode); args.in.numargs = 1; args.in.args[0].size = sizeof(inarg); args.in.args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; err = 0; } return err; } static int fuse_perm_getattr(struct inode *inode, int mask) { if (mask & MAY_NOT_BLOCK) return -ECHILD; forget_all_cached_acls(inode); return fuse_do_getattr(inode, NULL, NULL); } /* * Check permission. The two basic access models of FUSE are: * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission * modell. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this * is if ->permission() was invoked from sys_access() in which case an * access request is sent. Execute permission is still checked * locally based on file mode. */ static int fuse_permission(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) return -EACCES; /* * If attributes are needed, refresh them before proceeding */ if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); if (err) return err; } } if (fc->default_permissions) { err = generic_permission(inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root node will at first have no permissions */ if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) err = generic_permission(inode, mask); } /* Note: the opposite of the above test does not exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { if (refreshed) return -EACCES; err = fuse_perm_getattr(inode, mask); if (!err && !(inode->i_mode & S_IXUGO)) return -EACCES; } } return err; } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx) { while (nbytes >= FUSE_NAME_OFFSET) { struct fuse_dirent *dirent = (struct fuse_dirent *) buf; size_t reclen = FUSE_DIRENT_SIZE(dirent); if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; if (memchr(dirent->name, '/', dirent->namelen) != NULL) return -EIO; if (!dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type)) break; buf += reclen; nbytes -= reclen; ctx->pos = dirent->off; } return 0; } static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, u64 attr_version) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; struct dentry *parent = file->f_path.dentry; struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); struct dentry *dentry; struct dentry *alias; struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* * Unlike in the case of fuse_lookup, zero nodeid does not mean * ENOENT. Instead, it only means the userspace filesystem did * not want to return attributes/handle for this entry. * * So do nothing. */ return 0; } if (name.name[0] == '.') { /* * We could potentially refresh the attributes of the directory * and its parent? */ if (name.len == 1) return 0; if (name.name[1] == '.' && name.len == 2) return 0; } if (invalid_nodeid(o->nodeid)) return -EIO; if (fuse_invalid_attr(&o->attr)) return -EIO; fc = get_fuse_conn(dir); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); if (!dentry) { retry: dentry = d_alloc_parallel(parent, &name, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); } if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); if (!inode || get_node_id(inode) != o->nodeid || ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { d_invalidate(dentry); dput(dentry); goto retry; } if (fuse_is_bad(inode)) { dput(dentry); return -EIO; } fi = get_fuse_inode(inode); spin_lock(&fc->lock); fi->nlookup++; spin_unlock(&fc->lock); forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), attr_version); /* * The other branch comes via fuse_iget() * which bumps nlookup inside */ } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, entry_attr_timeout(o), attr_version); if (!inode) inode = ERR_PTR(-ENOMEM); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { dput(dentry); dentry = alias; } if (IS_ERR(dentry)) return PTR_ERR(dentry); } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); fuse_change_entry_timeout(dentry, o); dput(dentry); return 0; } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; size_t reclen; int over = 0; int ret; while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { direntplus = (struct fuse_direntplus *) buf; dirent = &direntplus->dirent; reclen = FUSE_DIRENTPLUS_SIZE(direntplus); if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) return -EIO; if (reclen > nbytes) break; if (memchr(dirent->name, '/', dirent->namelen) != NULL) return -EIO; if (!over) { /* We fill entries into dstbuf only as much as it can hold. But we still continue iterating over remaining entries to link them. If not, we need to send a FORGET for each of those which we did not link. */ over = !dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type); if (!over) ctx->pos = dirent->off; } buf += reclen; nbytes -= reclen; ret = fuse_direntplus_link(file, direntplus, attr_version); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } return 0; } static int fuse_readdir(struct file *file, struct dir_context *ctx) { int plus, err; size_t nbytes; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; u64 attr_version = 0; bool locked; if (fuse_is_bad(inode)) return -EIO; req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { fuse_put_request(fc, req); return -ENOMEM; } plus = fuse_use_readdirplus(inode, ctx); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = PAGE_SIZE; if (plus) { attr_version = fuse_get_attr_version(fc); fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } locked = fuse_lock_inode(inode); fuse_request_send(fc, req); fuse_unlock_inode(inode, locked); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); if (!err) { if (plus) { err = parse_dirplusfile(page_address(page), nbytes, file, ctx, attr_version); } else { err = parse_dirfile(page_address(page), nbytes, file, ctx); } } __free_page(page); fuse_invalidate_atime(inode); return err; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; ssize_t ret; if (!dentry) return ERR_PTR(-ECHILD); if (fuse_is_bad(inode)) return ERR_PTR(-EIO); link = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); args.in.h.opcode = FUSE_READLINK; args.in.h.nodeid = get_node_id(inode); args.out.argvar = 1; args.out.numargs = 1; args.out.args[0].size = PAGE_SIZE - 1; args.out.args[0].value = link; ret = fuse_simple_request(fc, &args); if (ret < 0) { kfree(link); link = ERR_PTR(ret); } else { link[ret] = '\0'; set_delayed_call(done, kfree_link, link); } fuse_invalidate_atime(inode); return link; } static int fuse_dir_open(struct inode *inode, struct file *file) { return fuse_open_common(inode, file, true); } static int fuse_dir_release(struct inode *inode, struct file *file) { fuse_release_common(file, true); return 0; } static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return fuse_fsync_common(file, start, end, datasync, 1); } static long fuse_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); } static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; /* Or if kernel i_mtime is the official one */ if (trust_local_mtime) return true; /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; /* In all other cases update */ return true; } static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { arg->valid |= FATTR_ATIME; arg->atime = iattr->ia_atime.tv_sec; arg->atimensec = iattr->ia_atime.tv_nsec; if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { arg->valid |= FATTR_CTIME; arg->ctime = iattr->ia_ctime.tv_sec; arg->ctimensec = iattr->ia_ctime.tv_nsec; } } /* * Prevent concurrent writepages on inode * * This is done by adding a negative bias to the inode write counter * and waiting for all pending writes to finish. */ void fuse_set_nowrite(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; spin_unlock(&fc->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } /* * Allow writepages on inode * * Remove the bias from the writecounter and send any queued * writepages. */ static void __fuse_release_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(fi->writectr != FUSE_NOWRITE); fi->writectr = 0; fuse_flush_writepages(inode); } void fuse_release_nowrite(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); spin_lock(&fc->lock); __fuse_release_nowrite(inode); spin_unlock(&fc->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct inode *inode, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { args->in.h.opcode = FUSE_SETATTR; args->in.h.nodeid = get_node_id(inode); args->in.numargs = 1; args->in.args[0].size = sizeof(*inarg_p); args->in.args[0].value = inarg_p; args->out.numargs = 1; args->out.args[0].size = sizeof(*outarg_p); args->out.args[0].value = outarg_p; } /* * Flush inode->i_mtime to the server */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; if (fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode->i_ctime.tv_sec; inarg.ctimensec = inode->i_ctime.tv_nsec; } if (ff) { inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); return fuse_simple_request(fc, &args); } /* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request * may fail, in which case we don't want to touch the mapping. * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; bool is_wb = fc->writeback_cache; loff_t oldsize; int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; err = setattr_prepare(dentry, attr); if (err) return err; if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... | O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); WARN_ON(attr->ia_size != 0); if (fc->atomic_o_trunc) { /* * No need to send request to userspace, since actual * truncation has already been done by OPEN. But still * need to truncate page cache. */ i_size_write(inode, 0); truncate_pagecache(inode, 0); return 0; } file = NULL; } if (attr->ia_valid & ATTR_SIZE) is_truncate = true; /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && S_ISREG(inode->i_mode) && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { err = write_inode_now(inode, true); if (err) return err; fuse_set_nowrite(inode); fuse_release_nowrite(inode); } if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (trust_local_cmtime && attr->ia_size != inode->i_size) attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } if (attr->ia_valid & ATTR_SIZE) { /* For mandatory locking in truncate */ inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fc, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); goto error; } if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { fuse_make_bad(inode); err = -EIO; goto error; } spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (attr->ia_valid & ATTR_CTIME) inode->i_ctime = attr->ia_ctime; /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fc->lock */ __fuse_release_nowrite(inode); } spin_unlock(&fc->lock); /* * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); return 0; error: if (is_truncate) fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); return err; } static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE); /* * The only sane way to reliably kill suid/sgid is to do it in * the userspace filesystem * * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv) { /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ ret = fuse_do_getattr(inode, NULL, file); if (ret) return ret; attr->ia_mode = inode->i_mode; if (inode->i_mode & S_ISUID) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISUID; } if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISGID; } } } if (!attr->ia_valid) return 0; ret = fuse_do_setattr(entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in * the filesystem, so forget cached acls for the inode. */ if (fc->posix_acl) forget_all_cached_acls(inode); /* Directory mode changed, may need to revalidate access */ if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) fuse_invalidate_entry_cache(entry); } return ret; } static int fuse_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) return -EACCES; return fuse_update_get_attr(inode, NULL, stat, flags); } static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, }; static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, .unlocked_ioctl = fuse_dir_ioctl, .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, }; static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, .getattr = fuse_getattr, .listxattr = fuse_listxattr, }; void fuse_init_common(struct inode *inode) { inode->i_op = &fuse_common_inode_operations; } void fuse_init_dir(struct inode *inode) { inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; } void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; } |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 22 22 22 18 22 22 3 10 6 10 15 15 15 15 15 15 15 15 7 7 7 7 7 10 22 22 22 1 1 1 1 1 5 1 1 1 1 2 5 5 6 5 5 2 5 5 2 5 5 3 2 2 2 2 1 1 1 4 3 4 4 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 | /* * Input driver to ExplorerPS/2 device driver module. * * Copyright (c) 1999-2002 Vojtech Pavlik * Copyright (c) 2004 Dmitry Torokhov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as published by * the Free Software Foundation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MOUSEDEV_MINOR_BASE 32 #define MOUSEDEV_MINORS 31 #define MOUSEDEV_MIX 63 #include <linux/bitops.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/random.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/kernel.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Mouse (ExplorerPS/2) device interfaces"); MODULE_LICENSE("GPL"); #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_X #define CONFIG_INPUT_MOUSEDEV_SCREEN_X 1024 #endif #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_Y #define CONFIG_INPUT_MOUSEDEV_SCREEN_Y 768 #endif static int xres = CONFIG_INPUT_MOUSEDEV_SCREEN_X; module_param(xres, uint, 0644); MODULE_PARM_DESC(xres, "Horizontal screen resolution"); static int yres = CONFIG_INPUT_MOUSEDEV_SCREEN_Y; module_param(yres, uint, 0644); MODULE_PARM_DESC(yres, "Vertical screen resolution"); static unsigned tap_time = 200; module_param(tap_time, uint, 0644); MODULE_PARM_DESC(tap_time, "Tap time for touchpads in absolute mode (msecs)"); struct mousedev_hw_data { int dx, dy, dz; int x, y; int abs_event; unsigned long buttons; }; struct mousedev { int open; struct input_handle handle; wait_queue_head_t wait; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; struct list_head mixdev_node; bool opened_by_mixdev; struct mousedev_hw_data packet; unsigned int pkt_count; int old_x[4], old_y[4]; int frac_dx, frac_dy; unsigned long touch; int (*open_device)(struct mousedev *mousedev); void (*close_device)(struct mousedev *mousedev); }; enum mousedev_emul { MOUSEDEV_EMUL_PS2, MOUSEDEV_EMUL_IMPS, MOUSEDEV_EMUL_EXPS }; struct mousedev_motion { int dx, dy, dz; unsigned long buttons; }; #define PACKET_QUEUE_LEN 16 struct mousedev_client { struct fasync_struct *fasync; struct mousedev *mousedev; struct list_head node; struct mousedev_motion packets[PACKET_QUEUE_LEN]; unsigned int head, tail; spinlock_t packet_lock; int pos_x, pos_y; u8 ps2[6]; unsigned char ready, buffer, bufsiz; unsigned char imexseq, impsseq; enum mousedev_emul mode; unsigned long last_buttons; }; #define MOUSEDEV_SEQ_LEN 6 static unsigned char mousedev_imps_seq[] = { 0xf3, 200, 0xf3, 100, 0xf3, 80 }; static unsigned char mousedev_imex_seq[] = { 0xf3, 200, 0xf3, 200, 0xf3, 80 }; static struct mousedev *mousedev_mix; static LIST_HEAD(mousedev_mix_list); #define fx(i) (mousedev->old_x[(mousedev->pkt_count - (i)) & 03]) #define fy(i) (mousedev->old_y[(mousedev->pkt_count - (i)) & 03]) static void mousedev_touchpad_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int size, tmp; enum { FRACTION_DENOM = 128 }; switch (code) { case ABS_X: fx(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = ((value - fx(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dx; mousedev->packet.dx = tmp / FRACTION_DENOM; mousedev->frac_dx = tmp - mousedev->packet.dx * FRACTION_DENOM; } break; case ABS_Y: fy(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { /* use X size for ABS_Y to keep the same scale */ size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = -((value - fy(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dy; mousedev->packet.dy = tmp / FRACTION_DENOM; mousedev->frac_dy = tmp - mousedev->packet.dy * FRACTION_DENOM; } break; } } static void mousedev_abs_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int min, max, size; switch (code) { case ABS_X: min = input_abs_get_min(dev, ABS_X); max = input_abs_get_max(dev, ABS_X); size = max - min; if (size == 0) size = xres ? : 1; value = clamp(value, min, max); mousedev->packet.x = ((value - min) * xres) / size; mousedev->packet.abs_event = 1; break; case ABS_Y: min = input_abs_get_min(dev, ABS_Y); max = input_abs_get_max(dev, ABS_Y); size = max - min; if (size == 0) size = yres ? : 1; value = clamp(value, min, max); mousedev->packet.y = yres - ((value - min) * yres) / size; mousedev->packet.abs_event = 1; break; } } static void mousedev_rel_event(struct mousedev *mousedev, unsigned int code, int value) { switch (code) { case REL_X: mousedev->packet.dx += value; break; case REL_Y: mousedev->packet.dy -= value; break; case REL_WHEEL: mousedev->packet.dz -= value; break; } } static void mousedev_key_event(struct mousedev *mousedev, unsigned int code, int value) { int index; switch (code) { case BTN_TOUCH: case BTN_0: case BTN_LEFT: index = 0; break; case BTN_STYLUS: case BTN_1: case BTN_RIGHT: index = 1; break; case BTN_2: case BTN_FORWARD: case BTN_STYLUS2: case BTN_MIDDLE: index = 2; break; case BTN_3: case BTN_BACK: case BTN_SIDE: index = 3; break; case BTN_4: case BTN_EXTRA: index = 4; break; default: return; } if (value) { set_bit(index, &mousedev->packet.buttons); set_bit(index, &mousedev_mix->packet.buttons); } else { clear_bit(index, &mousedev->packet.buttons); clear_bit(index, &mousedev_mix->packet.buttons); } } static void mousedev_notify_readers(struct mousedev *mousedev, struct mousedev_hw_data *packet) { struct mousedev_client *client; struct mousedev_motion *p; unsigned int new_head; int wake_readers = 0; rcu_read_lock(); list_for_each_entry_rcu(client, &mousedev->client_list, node) { /* Just acquire the lock, interrupts already disabled */ spin_lock(&client->packet_lock); p = &client->packets[client->head]; if (client->ready && p->buttons != mousedev->packet.buttons) { new_head = (client->head + 1) % PACKET_QUEUE_LEN; if (new_head != client->tail) { p = &client->packets[client->head = new_head]; memset(p, 0, sizeof(struct mousedev_motion)); } } if (packet->abs_event) { p->dx += packet->x - client->pos_x; p->dy += packet->y - client->pos_y; client->pos_x = packet->x; client->pos_y = packet->y; } client->pos_x += packet->dx; client->pos_x = clamp_val(client->pos_x, 0, xres); client->pos_y += packet->dy; client->pos_y = clamp_val(client->pos_y, 0, yres); p->dx += packet->dx; p->dy += packet->dy; p->dz += packet->dz; p->buttons = mousedev->packet.buttons; if (p->dx || p->dy || p->dz || p->buttons != client->last_buttons) client->ready = 1; spin_unlock(&client->packet_lock); if (client->ready) { kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_readers = 1; } } rcu_read_unlock(); if (wake_readers) wake_up_interruptible(&mousedev->wait); } static void mousedev_touchpad_touch(struct mousedev *mousedev, int value) { if (!value) { if (mousedev->touch && time_before(jiffies, mousedev->touch + msecs_to_jiffies(tap_time))) { /* * Toggle left button to emulate tap. * We rely on the fact that mousedev_mix always has 0 * motion packet so we won't mess current position. */ set_bit(0, &mousedev->packet.buttons); set_bit(0, &mousedev_mix->packet.buttons); mousedev_notify_readers(mousedev, &mousedev_mix->packet); mousedev_notify_readers(mousedev_mix, &mousedev_mix->packet); clear_bit(0, &mousedev->packet.buttons); clear_bit(0, &mousedev_mix->packet.buttons); } mousedev->touch = mousedev->pkt_count = 0; mousedev->frac_dx = 0; mousedev->frac_dy = 0; } else if (!mousedev->touch) mousedev->touch = jiffies; } static void mousedev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct mousedev *mousedev = handle->private; switch (type) { case EV_ABS: /* Ignore joysticks */ if (test_bit(BTN_TRIGGER, handle->dev->keybit)) return; if (test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_event(handle->dev, mousedev, code, value); else mousedev_abs_event(handle->dev, mousedev, code, value); break; case EV_REL: mousedev_rel_event(mousedev, code, value); break; case EV_KEY: if (value != 2) { if (code == BTN_TOUCH && test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_touch(mousedev, value); else mousedev_key_event(mousedev, code, value); } break; case EV_SYN: if (code == SYN_REPORT) { if (mousedev->touch) { mousedev->pkt_count++; /* * Input system eats duplicate events, * but we need all of them to do correct * averaging so apply present one forward */ fx(0) = fx(1); fy(0) = fy(1); } mousedev_notify_readers(mousedev, &mousedev->packet); mousedev_notify_readers(mousedev_mix, &mousedev->packet); mousedev->packet.dx = mousedev->packet.dy = mousedev->packet.dz = 0; mousedev->packet.abs_event = 0; } break; } } static int mousedev_fasync(int fd, struct file *file, int on) { struct mousedev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void mousedev_free(struct device *dev) { struct mousedev *mousedev = container_of(dev, struct mousedev, dev); input_put_device(mousedev->handle.dev); kfree(mousedev); } static int mousedev_open_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev->mutex); if (retval) return retval; if (!mousedev->exist) retval = -ENODEV; else if (!mousedev->open++) { retval = input_open_device(&mousedev->handle); if (retval) mousedev->open--; } mutex_unlock(&mousedev->mutex); return retval; } static void mousedev_close_device(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); if (mousedev->exist && !--mousedev->open) input_close_device(&mousedev->handle); mutex_unlock(&mousedev->mutex); } /* * Open all available devices so they can all be multiplexed in one. * stream. Note that this function is called with mousedev_mix->mutex * held. */ static int mixdev_open_devices(struct mousedev *mixdev) { int error; error = mutex_lock_interruptible(&mixdev->mutex); if (error) return error; if (!mixdev->open++) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (!mousedev->opened_by_mixdev) { if (mousedev_open_device(mousedev)) continue; mousedev->opened_by_mixdev = true; } } } mutex_unlock(&mixdev->mutex); return 0; } /* * Close all devices that were opened as part of multiplexed * device. Note that this function is called with mousedev_mix->mutex * held. */ static void mixdev_close_devices(struct mousedev *mixdev) { mutex_lock(&mixdev->mutex); if (!--mixdev->open) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } } } mutex_unlock(&mixdev->mutex); } static void mousedev_attach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_add_tail_rcu(&client->node, &mousedev->client_list); spin_unlock(&mousedev->client_lock); } static void mousedev_detach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_del_rcu(&client->node); spin_unlock(&mousedev->client_lock); synchronize_rcu(); } static int mousedev_release(struct inode *inode, struct file *file) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; mousedev_detach_client(mousedev, client); kfree(client); mousedev->close_device(mousedev); return 0; } static int mousedev_open(struct inode *inode, struct file *file) { struct mousedev_client *client; struct mousedev *mousedev; int error; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX if (imajor(inode) == MISC_MAJOR) mousedev = mousedev_mix; else #endif mousedev = container_of(inode->i_cdev, struct mousedev, cdev); client = kzalloc(sizeof(struct mousedev_client), GFP_KERNEL); if (!client) return -ENOMEM; spin_lock_init(&client->packet_lock); client->pos_x = xres / 2; client->pos_y = yres / 2; client->mousedev = mousedev; mousedev_attach_client(mousedev, client); error = mousedev->open_device(mousedev); if (error) goto err_free_client; file->private_data = client; nonseekable_open(inode, file); return 0; err_free_client: mousedev_detach_client(mousedev, client); kfree(client); return error; } static void mousedev_packet(struct mousedev_client *client, u8 *ps2_data) { struct mousedev_motion *p = &client->packets[client->tail]; s8 dx, dy, dz; dx = clamp_val(p->dx, -127, 127); p->dx -= dx; dy = clamp_val(p->dy, -127, 127); p->dy -= dy; ps2_data[0] = BIT(3); ps2_data[0] |= ((dx & BIT(7)) >> 3) | ((dy & BIT(7)) >> 2); ps2_data[0] |= p->buttons & 0x07; ps2_data[1] = dx; ps2_data[2] = dy; switch (client->mode) { case MOUSEDEV_EMUL_EXPS: dz = clamp_val(p->dz, -7, 7); p->dz -= dz; ps2_data[3] = (dz & 0x0f) | ((p->buttons & 0x18) << 1); client->bufsiz = 4; break; case MOUSEDEV_EMUL_IMPS: dz = clamp_val(p->dz, -127, 127); p->dz -= dz; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); ps2_data[3] = dz; client->bufsiz = 4; break; case MOUSEDEV_EMUL_PS2: default: p->dz = 0; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); client->bufsiz = 3; break; } if (!p->dx && !p->dy && !p->dz) { if (client->tail == client->head) { client->ready = 0; client->last_buttons = p->buttons; } else client->tail = (client->tail + 1) % PACKET_QUEUE_LEN; } } static void mousedev_generate_response(struct mousedev_client *client, int command) { client->ps2[0] = 0xfa; /* ACK */ switch (command) { case 0xeb: /* Poll */ mousedev_packet(client, &client->ps2[1]); client->bufsiz++; /* account for leading ACK */ break; case 0xf2: /* Get ID */ switch (client->mode) { case MOUSEDEV_EMUL_PS2: client->ps2[1] = 0; break; case MOUSEDEV_EMUL_IMPS: client->ps2[1] = 3; break; case MOUSEDEV_EMUL_EXPS: client->ps2[1] = 4; break; } client->bufsiz = 2; break; case 0xe9: /* Get info */ client->ps2[1] = 0x60; client->ps2[2] = 3; client->ps2[3] = 200; client->bufsiz = 4; break; case 0xff: /* Reset */ client->impsseq = client->imexseq = 0; client->mode = MOUSEDEV_EMUL_PS2; client->ps2[1] = 0xaa; client->ps2[2] = 0x00; client->bufsiz = 3; break; default: client->bufsiz = 1; break; } client->buffer = client->bufsiz; } static ssize_t mousedev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; unsigned char c; unsigned int i; for (i = 0; i < count; i++) { if (get_user(c, buffer + i)) return -EFAULT; spin_lock_irq(&client->packet_lock); if (c == mousedev_imex_seq[client->imexseq]) { if (++client->imexseq == MOUSEDEV_SEQ_LEN) { client->imexseq = 0; client->mode = MOUSEDEV_EMUL_EXPS; } } else client->imexseq = 0; if (c == mousedev_imps_seq[client->impsseq]) { if (++client->impsseq == MOUSEDEV_SEQ_LEN) { client->impsseq = 0; client->mode = MOUSEDEV_EMUL_IMPS; } } else client->impsseq = 0; mousedev_generate_response(client, c); spin_unlock_irq(&client->packet_lock); cond_resched(); } kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_up_interruptible(&client->mousedev->wait); return count; } static ssize_t mousedev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; u8 data[sizeof(client->ps2)]; int retval = 0; if (!client->ready && !client->buffer && mousedev->exist && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(mousedev->wait, !mousedev->exist || client->ready || client->buffer); if (retval) return retval; if (!mousedev->exist) return -ENODEV; spin_lock_irq(&client->packet_lock); if (!client->buffer && client->ready) { mousedev_packet(client, client->ps2); client->buffer = client->bufsiz; } if (count > client->buffer) count = client->buffer; memcpy(data, client->ps2 + client->bufsiz - client->buffer, count); client->buffer -= count; spin_unlock_irq(&client->packet_lock); if (copy_to_user(buffer, data, count)) return -EFAULT; return count; } /* No kernel lock - fine */ static __poll_t mousedev_poll(struct file *file, poll_table *wait) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; __poll_t mask; poll_wait(file, &mousedev->wait, wait); mask = mousedev->exist ? EPOLLOUT | EPOLLWRNORM : EPOLLHUP | EPOLLERR; if (client->ready || client->buffer) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations mousedev_fops = { .owner = THIS_MODULE, .read = mousedev_read, .write = mousedev_write, .poll = mousedev_poll, .open = mousedev_open, .release = mousedev_release, .fasync = mousedev_fasync, .llseek = noop_llseek, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void mousedev_mark_dead(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); mousedev->exist = false; mutex_unlock(&mousedev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. */ static void mousedev_hangup(struct mousedev *mousedev) { struct mousedev_client *client; spin_lock(&mousedev->client_lock); list_for_each_entry(client, &mousedev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&mousedev->client_lock); wake_up_interruptible(&mousedev->wait); } static void mousedev_cleanup(struct mousedev *mousedev) { struct input_handle *handle = &mousedev->handle; mousedev_mark_dead(mousedev); mousedev_hangup(mousedev); /* mousedev is marked dead so no one else accesses mousedev->open */ if (mousedev->open) input_close_device(handle); } static int mousedev_reserve_minor(bool mixdev) { int minor; if (mixdev) { minor = input_get_new_minor(MOUSEDEV_MIX, 1, false); if (minor < 0) pr_err("failed to reserve mixdev minor: %d\n", minor); } else { minor = input_get_new_minor(MOUSEDEV_MINOR_BASE, MOUSEDEV_MINORS, true); if (minor < 0) pr_err("failed to reserve new minor: %d\n", minor); } return minor; } static struct mousedev *mousedev_create(struct input_dev *dev, struct input_handler *handler, bool mixdev) { struct mousedev *mousedev; int minor; int error; minor = mousedev_reserve_minor(mixdev); if (minor < 0) { error = minor; goto err_out; } mousedev = kzalloc(sizeof(struct mousedev), GFP_KERNEL); if (!mousedev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&mousedev->client_list); INIT_LIST_HEAD(&mousedev->mixdev_node); spin_lock_init(&mousedev->client_lock); mutex_init(&mousedev->mutex); lockdep_set_subclass(&mousedev->mutex, mixdev ? SINGLE_DEPTH_NESTING : 0); init_waitqueue_head(&mousedev->wait); if (mixdev) { dev_set_name(&mousedev->dev, "mice"); mousedev->open_device = mixdev_open_devices; mousedev->close_device = mixdev_close_devices; } else { int dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < MOUSEDEV_MINOR_BASE + MOUSEDEV_MINORS) dev_no -= MOUSEDEV_MINOR_BASE; dev_set_name(&mousedev->dev, "mouse%d", dev_no); mousedev->open_device = mousedev_open_device; mousedev->close_device = mousedev_close_device; } mousedev->exist = true; mousedev->handle.dev = input_get_device(dev); mousedev->handle.name = dev_name(&mousedev->dev); mousedev->handle.handler = handler; mousedev->handle.private = mousedev; mousedev->dev.class = &input_class; if (dev) mousedev->dev.parent = &dev->dev; mousedev->dev.devt = MKDEV(INPUT_MAJOR, minor); mousedev->dev.release = mousedev_free; device_initialize(&mousedev->dev); if (!mixdev) { error = input_register_handle(&mousedev->handle); if (error) goto err_free_mousedev; } cdev_init(&mousedev->cdev, &mousedev_fops); error = cdev_device_add(&mousedev->cdev, &mousedev->dev); if (error) goto err_cleanup_mousedev; return mousedev; err_cleanup_mousedev: mousedev_cleanup(mousedev); if (!mixdev) input_unregister_handle(&mousedev->handle); err_free_mousedev: put_device(&mousedev->dev); err_free_minor: input_free_minor(minor); err_out: return ERR_PTR(error); } static void mousedev_destroy(struct mousedev *mousedev) { cdev_device_del(&mousedev->cdev, &mousedev->dev); mousedev_cleanup(mousedev); input_free_minor(MINOR(mousedev->dev.devt)); if (mousedev != mousedev_mix) input_unregister_handle(&mousedev->handle); put_device(&mousedev->dev); } static int mixdev_add_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev_mix->mutex); if (retval) return retval; if (mousedev_mix->open) { retval = mousedev_open_device(mousedev); if (retval) goto out; mousedev->opened_by_mixdev = true; } get_device(&mousedev->dev); list_add_tail(&mousedev->mixdev_node, &mousedev_mix_list); out: mutex_unlock(&mousedev_mix->mutex); return retval; } static void mixdev_remove_device(struct mousedev *mousedev) { mutex_lock(&mousedev_mix->mutex); if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } list_del_init(&mousedev->mixdev_node); mutex_unlock(&mousedev_mix->mutex); put_device(&mousedev->dev); } static int mousedev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct mousedev *mousedev; int error; mousedev = mousedev_create(dev, handler, false); if (IS_ERR(mousedev)) return PTR_ERR(mousedev); error = mixdev_add_device(mousedev); if (error) { mousedev_destroy(mousedev); return error; } return 0; } static void mousedev_disconnect(struct input_handle *handle) { struct mousedev *mousedev = handle->private; mixdev_remove_device(mousedev); mousedev_destroy(mousedev); } static const struct input_device_id mousedev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .relbit = { BIT_MASK(REL_X) | BIT_MASK(REL_Y) }, }, /* A mouse like device, at least one button, two relative axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .relbit = { BIT_MASK(REL_WHEEL) }, }, /* A separate scrollwheel */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* A tablet like device, at least touch detection, two absolute axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOOL_FINGER)] = BIT_MASK(BTN_TOOL_FINGER) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | BIT_MASK(ABS_PRESSURE) | BIT_MASK(ABS_TOOL_WIDTH) }, }, /* A touchpad */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* Mouse-like device with absolute X and Y but ordinary clicks, like hp ILO2 High Performance mouse */ { }, /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, mousedev_ids); static struct input_handler mousedev_handler = { .event = mousedev_event, .connect = mousedev_connect, .disconnect = mousedev_disconnect, .legacy_minors = true, .minor = MOUSEDEV_MINOR_BASE, .name = "mousedev", .id_table = mousedev_ids, }; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX #include <linux/miscdevice.h> static struct miscdevice psaux_mouse = { .minor = PSMOUSE_MINOR, .name = "psaux", .fops = &mousedev_fops, }; static bool psaux_registered; static void __init mousedev_psaux_register(void) { int error; error = misc_register(&psaux_mouse); if (error) pr_warn("could not register psaux device, error: %d\n", error); else psaux_registered = true; } static void __exit mousedev_psaux_unregister(void) { if (psaux_registered) misc_deregister(&psaux_mouse); } #else static inline void mousedev_psaux_register(void) { } static inline void mousedev_psaux_unregister(void) { } #endif static int __init mousedev_init(void) { int error; mousedev_mix = mousedev_create(NULL, &mousedev_handler, true); if (IS_ERR(mousedev_mix)) return PTR_ERR(mousedev_mix); error = input_register_handler(&mousedev_handler); if (error) { mousedev_destroy(mousedev_mix); return error; } mousedev_psaux_register(); pr_info("PS/2 mouse device common for all mice\n"); return 0; } static void __exit mousedev_exit(void) { mousedev_psaux_unregister(); input_unregister_handler(&mousedev_handler); mousedev_destroy(mousedev_mix); } module_init(mousedev_init); module_exit(mousedev_exit); |
51 51 51 51 50 50 51 50 91 11 11 7 7 7 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | // SPDX-License-Identifier: GPL-2.0 /* * KVM coalesced MMIO * * Copyright (c) 2008 Bull S.A.S. * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Author: Laurent Vivier <Laurent.Vivier@bull.net> * */ #include <kvm/iodev.h> #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/kvm.h> #include "coalesced_mmio.h" static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev) { return container_of(dev, struct kvm_coalesced_mmio_dev, dev); } static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev, gpa_t addr, int len) { /* is it in a batchable area ? * (addr,len) is fully included in * (zone->addr, zone->size) */ if (len < 0) return 0; if (addr + len < addr) return 0; if (addr < dev->zone.addr) return 0; if (addr + len > dev->zone.addr + dev->zone.size) return 0; return 1; } static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last) { struct kvm_coalesced_mmio_ring *ring; unsigned avail; /* Are we able to batch it ? */ /* last is the first free entry * check if we don't meet the first used entry * there is always one unused entry in the buffer */ ring = dev->kvm->coalesced_mmio_ring; avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX; if (avail == 0) { /* full */ return 0; } return 1; } static int coalesced_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; __u32 insert; if (!coalesced_mmio_in_range(dev, addr, len)) return -EOPNOTSUPP; spin_lock(&dev->kvm->ring_lock); insert = READ_ONCE(ring->last); if (!coalesced_mmio_has_room(dev, insert) || insert >= KVM_COALESCED_MMIO_MAX) { spin_unlock(&dev->kvm->ring_lock); return -EOPNOTSUPP; } /* copy data in first free entry of the ring */ ring->coalesced_mmio[insert].phys_addr = addr; ring->coalesced_mmio[insert].len = len; memcpy(ring->coalesced_mmio[insert].data, val, len); smp_wmb(); ring->last = (insert + 1) % KVM_COALESCED_MMIO_MAX; spin_unlock(&dev->kvm->ring_lock); return 0; } static void coalesced_mmio_destructor(struct kvm_io_device *this) { struct kvm_coalesced_mmio_dev *dev = to_mmio(this); list_del(&dev->list); kfree(dev); } static const struct kvm_io_device_ops coalesced_mmio_ops = { .write = coalesced_mmio_write, .destructor = coalesced_mmio_destructor, }; int kvm_coalesced_mmio_init(struct kvm *kvm) { struct page *page; int ret; ret = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto out_err; ret = 0; kvm->coalesced_mmio_ring = page_address(page); /* * We're using this spinlock to sync access to the coalesced ring. * The list doesn't need it's own lock since device registration and * unregistration should only happen when kvm->slots_lock is held. */ spin_lock_init(&kvm->ring_lock); INIT_LIST_HEAD(&kvm->coalesced_zones); out_err: return ret; } void kvm_coalesced_mmio_free(struct kvm *kvm) { if (kvm->coalesced_mmio_ring) free_page((unsigned long)kvm->coalesced_mmio_ring); } int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { int ret; struct kvm_coalesced_mmio_dev *dev; dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); if (!dev) return -ENOMEM; kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops); dev->kvm = kvm; dev->zone = *zone; mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, zone->addr, zone->size, &dev->dev); if (ret < 0) goto out_free_dev; list_add_tail(&dev->list, &kvm->coalesced_zones); mutex_unlock(&kvm->slots_lock); return 0; out_free_dev: mutex_unlock(&kvm->slots_lock); kfree(dev); return ret; } int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, struct kvm_coalesced_mmio_zone *zone) { struct kvm_coalesced_mmio_dev *dev, *tmp; mutex_lock(&kvm->slots_lock); list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) { kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev); kvm_iodevice_destructor(&dev->dev); } mutex_unlock(&kvm->slots_lock); return 0; } |
186 35 70 40 104 297 44 32 99 130 3 13 59 86 52 18 4 3 90 52 18 4 38 586 313 18 18 15221 15233 14 15 641 165 9 689 2 24 127 268 2 606 2 82 73 28 8 8 6 14 6 69 8 131 25 127 51 8 34 34 12 80 3 10031 164 181 186 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_XFRM_H #define _NET_XFRM_H #include <linux/compiler.h> #include <linux/xfrm.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/audit.h> #include <linux/slab.h> #include <linux/refcount.h> #include <net/sock.h> #include <net/dst.h> #include <net/ip.h> #include <net/route.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/flow.h> #include <net/gro_cells.h> #include <linux/interrupt.h> #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #define XFRM_PROTO_ESP 50 #define XFRM_PROTO_AH 51 #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS #define XFRM_ALIGN4(len) (((len) + 3) & ~3) #define XFRM_ALIGN8(len) (((len) + 7) & ~7) #define MODULE_ALIAS_XFRM_MODE(family, encap) \ MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) #define MODULE_ALIAS_XFRM_TYPE(family, proto) \ MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto)) #define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \ MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto)) #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) #else #define XFRM_INC_STATS(net, field) ((void)(net)) #endif /* Organization of SPD aka "XFRM rules" ------------------------------------ Basic objects: - policy rule, struct xfrm_policy (=SPD entry) - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle) - instance of a transformer, struct xfrm_state (=SA) - template to clone xfrm_state, struct xfrm_tmpl SPD is plain linear list of xfrm_policy rules, ordered by priority. (To be compatible with existing pfkeyv2 implementations, many rules with priority of 0x7fffffff are allowed to exist and such rules are ordered in an unpredictable way, thanks to bsd folks.) Lookup is plain linear search until the first match with selector. If "action" is "block", then we prohibit the flow, otherwise: if "xfrms_nr" is zero, the flow passes untransformed. Otherwise, policy entry has list of up to XFRM_MAX_DEPTH transformations, described by templates xfrm_tmpl. Each template is resolved to a complete xfrm_state (see below) and we pack bundle of transformations to a dst_entry returned to requestor. dst -. xfrm .-> xfrm_state #1 |---. child .-> dst -. xfrm .-> xfrm_state #2 |---. child .-> dst -. xfrm .-> xfrm_state #3 |---. child .-> NULL Bundles are cached at xrfm_policy struct (field ->bundles). Resolution of xrfm_tmpl ----------------------- Template contains: 1. ->mode Mode: transport or tunnel 2. ->id.proto Protocol: AH/ESP/IPCOMP 3. ->id.daddr Remote tunnel endpoint, ignored for transport mode. Q: allow to resolve security gateway? 4. ->id.spi If not zero, static SPI. 5. ->saddr Local tunnel endpoint, ignored for transport mode. 6. ->algos List of allowed algos. Plain bitmask now. Q: ealgos, aalgos, calgos. What a mess... 7. ->share Sharing mode. Q: how to implement private sharing mode? To add struct sock* to flow id? Having this template we search through SAD searching for entries with appropriate mode/proto/algo, permitted by selector. If no appropriate entry found, it is requested from key manager. PROBLEMS: Q: How to find all the bundles referring to a physical path for PMTU discovery? Seems, dst should contain list of all parents... and enter to infinite locking hierarchy disaster. No! It is easier, we will not search for them, let them find us. We add genid to each dst plus pointer to genid of raw IP route, pmtu disc will update pmtu on raw IP route and increase its genid. dst_check() will see this for top level and trigger resyncing metrics. Plus, it will be made via sk->sk_dst_cache. Solved. */ struct xfrm_state_walk { struct list_head all; u8 state; u8 dying; u8 proto; u32 seq; struct xfrm_address_filter *filter; }; struct xfrm_state_offload { struct net_device *dev; unsigned long offload_handle; unsigned int num_exthdrs; u8 flags; }; /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; union { struct hlist_node gclist; struct hlist_node bydst; }; struct hlist_node bysrc; struct hlist_node byspi; refcount_t refcnt; spinlock_t lock; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; u32 if_id; u32 tfcpad; u32 genid; /* Key manager bits */ struct xfrm_state_walk km; /* Parameters of this state. */ struct { u32 reqid; u8 mode; u8 replay_window; u8 aalgo, ealgo, calgo; u8 flags; u16 family; xfrm_address_t saddr; int header_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; /* Data for transformer */ struct xfrm_algo_auth *aalg; struct xfrm_algo *ealg; struct xfrm_algo *calg; struct xfrm_algo_aead *aead; const char *geniv; /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; /* Data for care-of address */ xfrm_address_t *coaddr; /* IPComp needs an IPIP tunnel for handling uncompressed packets */ struct xfrm_state *tunnel; /* If a tunnel, number of users + 1 */ atomic_t tunnel_users; /* State for replay detection */ struct xfrm_replay_state replay; struct xfrm_replay_state_esn *replay_esn; /* Replay detection state at the time we sent the last notification */ struct xfrm_replay_state preplay; struct xfrm_replay_state_esn *preplay_esn; /* The functions for replay detection. */ const struct xfrm_replay *repl; /* internal flag that only holds state for delayed aevent at the * moment */ u32 xflags; /* Replay detection notification settings */ u32 replay_maxage; u32 replay_maxdiff; /* Replay detection notification timer */ struct timer_list rtimer; /* Statistics */ struct xfrm_stats stats; struct xfrm_lifetime_cur curlft; struct tasklet_hrtimer mtimer; struct xfrm_state_offload xso; /* used to fix curlft->add_time when changing date */ long saved_tmo; /* Last used time */ time64_t lastused; struct page_frag xfrag; /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; struct xfrm_mode *inner_mode; struct xfrm_mode *inner_mode_iaf; struct xfrm_mode *outer_mode; const struct xfrm_type_offload *type_offload; /* Security context */ struct xfrm_sec_ctx *security; /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; }; static inline struct net *xs_net(struct xfrm_state *x) { return read_pnet(&x->xs_net); } /* xflags - make enum if more show up */ #define XFRM_TIME_DEFER 1 #define XFRM_SOFT_EXPIRE 2 enum { XFRM_STATE_VOID, XFRM_STATE_ACQ, XFRM_STATE_VALID, XFRM_STATE_ERROR, XFRM_STATE_EXPIRED, XFRM_STATE_DEAD }; /* callback structure passed from either netlink or pfkey */ struct km_event { union { u32 hard; u32 proto; u32 byid; u32 aevent; u32 type; } data; u32 seq; u32 portid; u32 event; struct net *net; }; struct xfrm_replay { void (*advance)(struct xfrm_state *x, __be32 net_seq); int (*check)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); int (*recheck)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void (*notify)(struct xfrm_state *x, int event); int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; struct xfrm_if_cb { struct xfrm_if *(*decode_session)(struct sk_buff *skb, unsigned short family); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr, u32 mark); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); int (*get_tos)(const struct flowi *fl); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); struct dst_entry *(*blackhole_route)(struct net *net, struct dst_entry *orig); }; int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { unsigned int family; unsigned int proto; __be16 eth_proto; struct module *owner; const struct xfrm_type *type_map[IPPROTO_MAX]; const struct xfrm_type_offload *type_offload_map[IPPROTO_MAX]; struct xfrm_mode *mode_map[XFRM_MODE_MAX]; int (*init_flags)(struct xfrm_state *x); void (*init_tempsel)(struct xfrm_selector *sel, const struct flowi *fl); void (*init_temprop)(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr); int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*output_finish)(struct sock *sk, struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, struct sk_buff *skb); int (*transport_finish)(struct sk_buff *skb, int async); void (*local_error)(struct sk_buff *skb, u32 mtu); }; int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family); struct xfrm_input_afinfo { unsigned int family; int (*callback)(struct sk_buff *skb, u8 protocol, int err); }; int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { char *description; struct module *owner; u8 proto; u8 flags; #define XFRM_TYPE_NON_FRAGMENT 1 #define XFRM_TYPE_REPLAY_PROT 2 #define XFRM_TYPE_LOCAL_COADDR 4 #define XFRM_TYPE_REMOTE_COADDR 8 int (*init_state)(struct xfrm_state *x); void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct sk_buff *skb); int (*output)(struct xfrm_state *, struct sk_buff *pskb); int (*reject)(struct xfrm_state *, struct sk_buff *, const struct flowi *); int (*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **); /* Estimate maximal size of result of transformation of a dgram */ u32 (*get_mtu)(struct xfrm_state *, int size); }; int xfrm_register_type(const struct xfrm_type *type, unsigned short family); int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); struct xfrm_type_offload { char *description; struct module *owner; u8 proto; void (*encap)(struct xfrm_state *, struct sk_buff *pskb); int (*input_tail)(struct xfrm_state *x, struct sk_buff *skb); int (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features); }; int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { /* * Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation * header. * * On entry, the transport header shall point to where the IP header * should be and the network header shall be set to where the IP * header currently is. skb->data shall point to the start of the * payload. */ int (*input2)(struct xfrm_state *x, struct sk_buff *skb); /* * This is the actual input entry point. * * For transport mode and equivalent this would be identical to * input2 (which does not need to be set). While tunnel mode * and equivalent would set this to the tunnel encapsulation function * xfrm4_prepare_input that would in turn call input2. */ int (*input)(struct xfrm_state *x, struct sk_buff *skb); /* * Add encapsulation header. * * On exit, the transport header will be set to the start of the * encapsulation header to be filled in by x->type->output and * the mac header will be set to the nextheader (protocol for * IPv4) field of the extension header directly preceding the * encapsulation header, or in its absence, that of the top IP * header. The value of the network header will always point * to the top IP header while skb->data will point to the payload. */ int (*output2)(struct xfrm_state *x,struct sk_buff *skb); /* * This is the actual output entry point. * * For transport mode and equivalent this would be identical to * output2 (which does not need to be set). While tunnel mode * and equivalent would set this to a tunnel encapsulation function * (xfrm4_prepare_output or xfrm6_prepare_output) that would in turn * call output2. */ int (*output)(struct xfrm_state *x, struct sk_buff *skb); /* * Adjust pointers into the packet and do GSO segmentation. */ struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); /* * Adjust pointers into the packet when IPsec is done at layer2. */ void (*xmit)(struct xfrm_state *x, struct sk_buff *skb); struct xfrm_state_afinfo *afinfo; struct module *owner; unsigned int encap; int flags; }; /* Flags for xfrm_mode. */ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; int xfrm_register_mode(struct xfrm_mode *mode, int family); int xfrm_unregister_mode(struct xfrm_mode *mode, int family); static inline int xfrm_af2proto(unsigned int family) { switch(family) { case AF_INET: return IPPROTO_IPIP; case AF_INET6: return IPPROTO_IPV6; default: return 0; } } static inline struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return x->inner_mode; else return x->inner_mode_iaf; } struct xfrm_tmpl { /* id in template is interpreted as: * daddr - destination of tunnel, may be zero for transport mode. * spi - zero to acquire spi. Not zero if spi is static, then * daddr must be fixed too. * proto - AH/ESP/IPCOMP */ struct xfrm_id id; /* Source address of tunnel. Ignored, if it is not a tunnel. */ xfrm_address_t saddr; unsigned short encap_family; u32 reqid; /* Mode: transport, tunnel etc. */ u8 mode; /* Sharing mode: unique, this session only, this user only etc. */ u8 share; /* May skip this transfomration if no SA is found */ u8 optional; /* Skip aalgos/ealgos/calgos checks. */ u8 allalgs; /* Bit mask of algos allowed for acquisition */ u32 aalgos; u32 ealgos; u32 calgos; }; #define XFRM_MAX_DEPTH 6 #define XFRM_MAX_OFFLOAD_DEPTH 1 struct xfrm_policy_walk_entry { struct list_head all; u8 dead; }; struct xfrm_policy_walk { struct xfrm_policy_walk_entry walk; u8 type; u32 seq; }; struct xfrm_policy_queue { struct sk_buff_head hold_queue; struct timer_list hold_timer; unsigned long timeout; }; struct xfrm_policy { possible_net_t xp_net; struct hlist_node bydst; struct hlist_node byidx; /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; struct timer_list timer; atomic_t genid; u32 priority; u32 index; u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; struct xfrm_policy_walk_entry walk; struct xfrm_policy_queue polq; u8 type; u8 action; u8 flags; u8 xfrm_nr; u16 family; struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct rcu_head rcu; }; static inline struct net *xp_net(const struct xfrm_policy *xp) { return read_pnet(&xp->xp_net); } struct xfrm_kmaddress { xfrm_address_t local; xfrm_address_t remote; u32 reserved; u16 family; }; struct xfrm_migrate { xfrm_address_t old_daddr; xfrm_address_t old_saddr; xfrm_address_t new_daddr; xfrm_address_t new_saddr; u8 proto; u8 mode; u16 reserved; u32 reqid; u16 old_family; u16 new_family; }; #define XFRM_KM_TIMEOUT 30 /* what happened */ #define XFRM_REPLAY_UPDATE XFRM_AE_CR #define XFRM_REPLAY_TIMEOUT XFRM_AE_CE /* default aevent timeout in units of 100ms */ #define XFRM_AE_ETIME 10 /* Async Event timer multiplier */ #define XFRM_AE_ETH_M 10 /* default seq threshold size */ #define XFRM_AE_SEQT_SIZE 2 struct xfrm_mgr { struct list_head list; int (*notify)(struct xfrm_state *x, const struct km_event *c); int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp); struct xfrm_policy *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir); int (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); int (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c); int (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); int (*migrate)(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; int xfrm_register_km(struct xfrm_mgr *km); int xfrm_unregister_km(struct xfrm_mgr *km); struct xfrm_tunnel_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; } header; union { struct ip_tunnel *ip4; struct ip6_tnl *ip6; } tunnel; }; #define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0])) /* * This structure is used for the duration where packets are being * transformed by IPsec. As soon as the packet leaves IPsec the * area beyond the generic IP part may be overwritten. */ struct xfrm_skb_cb { struct xfrm_tunnel_skb_cb header; /* Sequence number for replay protection. */ union { struct { __u32 low; __u32 hi; } output; struct { __be32 low; __be32 hi; } input; } seq; }; #define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the afinfo prepare_input/prepare_output functions * to transmit header information to the mode input/output functions. */ struct xfrm_mode_skb_cb { struct xfrm_tunnel_skb_cb header; /* Copied from header for IPv4, always set to zero and DF for IPv6. */ __be16 id; __be16 frag_off; /* IP header length (excluding options or extension headers). */ u8 ihl; /* TOS for IPv4, class for IPv6. */ u8 tos; /* TTL for IPv4, hop limitfor IPv6. */ u8 ttl; /* Protocol for IPv4, NH for IPv6. */ u8 protocol; /* Option length for IPv4, zero for IPv6. */ u8 optlen; /* Used by IPv6 only, zero for IPv4. */ u8 flow_lbl[3]; }; #define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0])) /* * This structure is used by the input processing to locate the SPI and * related information. */ struct xfrm_spi_skb_cb { struct xfrm_tunnel_skb_cb header; unsigned int daddroff; unsigned int family; __be32 seq; }; #define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0])) #ifdef CONFIG_AUDITSYSCALL static inline struct audit_buffer *xfrm_audit_start(const char *op) { struct audit_buffer *audit_buf = NULL; if (audit_enabled == AUDIT_OFF) return NULL; audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_MAC_IPSEC_EVENT); if (audit_buf == NULL) return NULL; audit_log_format(audit_buf, "op=%s", op); return audit_buf; } static inline void xfrm_audit_helper_usrinfo(bool task_valid, struct audit_buffer *audit_buf) { const unsigned int auid = from_kuid(&init_user_ns, task_valid ? audit_get_loginuid(current) : INVALID_UID); const unsigned int ses = task_valid ? audit_get_sessionid(current) : AUDIT_SID_UNSET; audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses); audit_log_task_context(audit_buf); } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid); void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto); #else static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { } static inline void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { } static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { } static inline void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { } static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { } static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { } static inline void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { } #endif /* CONFIG_AUDITSYSCALL */ static inline void xfrm_pol_hold(struct xfrm_policy *policy) { if (likely(policy != NULL)) refcount_inc(&policy->refcnt); } void xfrm_policy_destroy(struct xfrm_policy *policy); static inline void xfrm_pol_put(struct xfrm_policy *policy) { if (refcount_dec_and_test(&policy->refcnt)) xfrm_policy_destroy(policy); } static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) { int i; for (i = npols - 1; i >= 0; --i) xfrm_pol_put(pols[i]); } void __xfrm_state_destroy(struct xfrm_state *, bool); static inline void __xfrm_state_put(struct xfrm_state *x) { refcount_dec(&x->refcnt); } static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) __xfrm_state_destroy(x, false); } static inline void xfrm_state_put_sync(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) __xfrm_state_destroy(x, true); } static inline void xfrm_state_hold(struct xfrm_state *x) { refcount_inc(&x->refcnt); } static inline bool addr_match(const void *token1, const void *token2, unsigned int prefixlen) { const __be32 *a1 = token1; const __be32 *a2 = token2; unsigned int pdw; unsigned int pbi; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pdw) if (memcmp(a1, a2, pdw << 2)) return false; if (pbi) { __be32 mask; mask = htonl((0xffffffff) << (32 - pbi)); if ((a1[pdw] ^ a2[pdw]) & mask) return false; } return true; } static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen) { /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */ if (sizeof(long) == 4 && prefixlen == 0) return true; return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))); } static __inline__ __be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.sport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.type); break; case IPPROTO_MH: port = htons(uli->mht.type); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) >> 16); break; default: port = 0; /*XXX*/ } return port; } static __inline__ __be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli) { __be16 port; switch(fl->flowi_proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_SCTP: port = uli->ports.dport; break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: port = htons(uli->icmpt.code); break; case IPPROTO_GRE: port = htons(ntohl(uli->gre_key) & 0xffff); break; default: port = 0; /*XXX*/ } return port; } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family); #ifdef CONFIG_SECURITY_NETWORK_XFRM /* If neither has a context --> match * Otherwise, both must have a context and the sids, doi, alg must match */ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return ((!s1 && !s2) || (s1 && s2 && (s1->ctx_sid == s2->ctx_sid) && (s1->ctx_doi == s2->ctx_doi) && (s1->ctx_alg == s2->ctx_alg))); } #else static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) { return true; } #endif /* A struct encoding bundle of transformations to apply to some set of flow. * * xdst->child points to the next element of bundle. * dst->xfrm points to an instanse of transformer. * * Due to unfortunate limitations of current routing cache, which we * have no time to fix, it mirrors struct rtable and bound to the same * routing key, including saddr,daddr. However, we can have many of * bundles differing by session id. All the bundles grow from a parent * policy rule. */ struct xfrm_dst { union { struct dst_entry dst; struct rtable rt; struct rt6_info rt6; } u; struct dst_entry *route; struct dst_entry *child; struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; u32 policy_genid; u32 route_mtu_cached; u32 child_mtu_cached; u32 route_cookie; u32 path_cookie; }; static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; return xdst->path; } #endif return (struct dst_entry *) dst; } static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE)) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; return xdst->child; } #endif return NULL; } #ifdef CONFIG_XFRM static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) { xdst->child = child; } static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); } #endif void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifyer */ }; struct xfrm_if { struct xfrm_if __rcu *next; /* next interface in list */ struct net_device *dev; /* virtual device associated with interface */ struct net *net; /* netns for packet i/o */ struct xfrm_if_parms p; /* interface parms */ struct gro_cells gro_cells; }; struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { __u32 low; __u32 hi; } seq; __u32 flags; #define SA_DELETE_REQ 1 #define CRYPTO_DONE 2 #define CRYPTO_NEXT_DONE 4 #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 #define XFRM_ESP_NO_TRAILER 64 #define XFRM_DEV_RESUME 128 #define XFRM_XMIT 256 __u32 status; #define CRYPTO_SUCCESS 1 #define CRYPTO_GENERIC_ERROR 2 #define CRYPTO_TRANSPORT_AH_AUTH_FAILED 4 #define CRYPTO_TRANSPORT_ESP_AUTH_FAILED 8 #define CRYPTO_TUNNEL_AH_AUTH_FAILED 16 #define CRYPTO_TUNNEL_ESP_AUTH_FAILED 32 #define CRYPTO_INVALID_PACKET_SYNTAX 64 #define CRYPTO_INVALID_PROTOCOL 128 __u8 proto; }; struct sec_path { refcount_t refcnt; int len; int olen; struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; static inline int secpath_exists(struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb->sp != NULL; #else return 0; #endif } static inline struct sec_path * secpath_get(struct sec_path *sp) { if (sp) refcount_inc(&sp->refcnt); return sp; } void __secpath_destroy(struct sec_path *sp); static inline void secpath_put(struct sec_path *sp) { if (sp && refcount_dec_and_test(&sp->refcnt)) __secpath_destroy(sp); } struct sec_path *secpath_dup(struct sec_path *src); int secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM secpath_put(skb->sp); skb->sp = NULL; #endif } static inline int xfrm_addr_any(const xfrm_address_t *addr, unsigned short family) { switch (family) { case AF_INET: return addr->a4 == 0; case AF_INET6: return ipv6_addr_any(&addr->in6); } return 0; } static inline int __xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (tmpl->saddr.a4 && tmpl->saddr.a4 != x->props.saddr.a4); } static inline int __xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x) { return (!ipv6_addr_any((struct in6_addr*)&tmpl->saddr) && !ipv6_addr_equal((struct in6_addr *)&tmpl->saddr, (struct in6_addr*)&x->props.saddr)); } static inline int xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_cmp(tmpl, x); case AF_INET6: return __xfrm6_state_addr_cmp(tmpl, x); } return !0; } #ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0); if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); return (!net->xfrm.policy_count[dir] && !skb->sp) || (skb_dst(skb)->flags & DST_NOPOLICY) || __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return __xfrm_policy_check2(sk, dir, skb, family, 0); } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET); } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return xfrm_policy_check(sk, dir, skb, AF_INET6); } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET, 1); } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return __xfrm_policy_check2(sk, dir, skb, AF_INET6, 1); } int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse); static inline int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(skb, fl, family, 0); } static inline int xfrm_decode_session_reverse(struct sk_buff *skb, struct flowi *fl, unsigned int family) { return __xfrm_decode_session(skb, fl, family, 1); } int __xfrm_route_forward(struct sk_buff *skb, unsigned short family); static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); return !net->xfrm.policy_count[XFRM_POLICY_OUT] || (skb_dst(skb)->flags & DST_NOXFRM) || __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET); } static inline int xfrm6_route_forward(struct sk_buff *skb) { return xfrm_route_forward(skb, AF_INET6); } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk); static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { sk->sk_policy[0] = NULL; sk->sk_policy[1] = NULL; if (unlikely(osk->sk_policy[0] || osk->sk_policy[1])) return __xfrm_sk_clone_policy(sk, osk); return 0; } int xfrm_policy_delete(struct xfrm_policy *pol, int dir); static inline void xfrm_sk_free_policy(struct sock *sk) { struct xfrm_policy *pol; pol = rcu_dereference_protected(sk->sk_policy[0], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX); sk->sk_policy[0] = NULL; } pol = rcu_dereference_protected(sk->sk_policy[1], 1); if (unlikely(pol != NULL)) { xfrm_policy_delete(pol, XFRM_POLICY_MAX+1); sk->sk_policy[1] = NULL; } } #else static inline void xfrm_sk_free_policy(struct sock *sk) {} static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { return 1; } static inline int xfrm_decode_session_reverse(struct sk_buff *skb, struct flowi *fl, unsigned int family) { return -ENOSYS; } static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, struct sk_buff *skb) { return 1; } #endif static __inline__ xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.daddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.daddr; } return NULL; } static __inline__ xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family) { switch (family){ case AF_INET: return (xfrm_address_t *)&fl->u.ip4.saddr; case AF_INET6: return (xfrm_address_t *)&fl->u.ip6.saddr; } return NULL; } static __inline__ void xfrm_flowi_addr_get(const struct flowi *fl, xfrm_address_t *saddr, xfrm_address_t *daddr, unsigned short family) { switch(family) { case AF_INET: memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4)); memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4)); break; case AF_INET6: saddr->in6 = fl->u.ip6.saddr; daddr->in6 = fl->u.ip6.daddr; break; } } static __inline__ int __xfrm4_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (daddr->a4 == x->id.daddr.a4 && (saddr->a4 == x->props.saddr.a4 || !saddr->a4 || !x->props.saddr.a4)) return 1; return 0; } static __inline__ int __xfrm6_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr) { if (ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) && (ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr) || ipv6_addr_any((struct in6_addr *)saddr) || ipv6_addr_any((struct in6_addr *)&x->props.saddr))) return 1; return 0; } static __inline__ int xfrm_state_addr_check(const struct xfrm_state *x, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, daddr, saddr); case AF_INET6: return __xfrm6_state_addr_check(x, daddr, saddr); } return 0; } static __inline__ int xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip4.daddr, (const xfrm_address_t *)&fl->u.ip4.saddr); case AF_INET6: return __xfrm6_state_addr_check(x, (const xfrm_address_t *)&fl->u.ip6.daddr, (const xfrm_address_t *)&fl->u.ip6.saddr); } return 0; } static inline int xfrm_state_kern(const struct xfrm_state *x) { return atomic_read(&x->tunnel_users); } static inline bool xfrm_id_proto_valid(u8 proto) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: case IPPROTO_COMP: #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: #endif return true; default: return false; } } /* IPSEC_PROTO_ANY only matches 3 IPsec protocols, 0 could match all. */ static inline int xfrm_id_proto_match(u8 proto, u8 userproto) { return (!userproto || proto == userproto || (userproto == IPSEC_PROTO_ANY && (proto == IPPROTO_AH || proto == IPPROTO_ESP || proto == IPPROTO_COMP))); } /* * xfrm algorithm information */ struct xfrm_algo_aead_info { char *geniv; u16 icv_truncbits; }; struct xfrm_algo_auth_info { u16 icv_truncbits; u16 icv_fullbits; }; struct xfrm_algo_encr_info { char *geniv; u16 blockbits; u16 defkeybits; }; struct xfrm_algo_comp_info { u16 threshold; }; struct xfrm_algo_desc { char *name; char *compat; u8 available:1; u8 pfkey_supported:1; union { struct xfrm_algo_aead_info aead; struct xfrm_algo_auth_info auth; struct xfrm_algo_encr_info encr; struct xfrm_algo_comp_info comp; } uinfo; struct sadb_alg desc; }; /* XFRM protocol handlers. */ struct xfrm4_protocol { int (*handler)(struct sk_buff *skb); int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm4_protocol __rcu *next; int priority; }; struct xfrm6_protocol { int (*handler)(struct sk_buff *skb); int (*cb_handler)(struct sk_buff *skb, int err); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_protocol __rcu *next; int priority; }; /* XFRM tunnel handlers. */ struct xfrm_tunnel { int (*handler)(struct sk_buff *skb); int (*err_handler)(struct sk_buff *skb, u32 info); struct xfrm_tunnel __rcu *next; int priority; }; struct xfrm6_tunnel { int (*handler)(struct sk_buff *skb); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); struct xfrm6_tunnel __rcu *next; int priority; }; void xfrm_init(void); void xfrm4_init(void); int xfrm_state_init(struct net *net); void xfrm_state_fini(struct net *net); void xfrm4_state_init(void); void xfrm4_protocol_init(void); #ifdef CONFIG_XFRM int xfrm6_init(void); void xfrm6_fini(void); int xfrm6_state_init(void); void xfrm6_state_fini(void); int xfrm6_protocol_init(void); void xfrm6_protocol_fini(void); #else static inline int xfrm6_init(void) { return 0; } static inline void xfrm6_fini(void) { ; } #endif #ifdef CONFIG_XFRM_STATISTICS int xfrm_proc_init(struct net *net); void xfrm_proc_fini(struct net *net); #endif int xfrm_sysctl_init(struct net *net); #ifdef CONFIG_SYSCTL void xfrm_sysctl_fini(struct net *net); #else static inline void xfrm_sysctl_fini(struct net *net) { } #endif void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter); int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net); struct xfrm_state *xfrm_state_alloc(struct net *net); void xfrm_state_free(struct xfrm_state *x); struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid); struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family); #ifdef CONFIG_XFRM_SUB_POLICY int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family, struct net *net); int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family); #else static inline int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family, struct net *net) { return -ENOSYS; } static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { return -ENOSYS; } #endif struct xfrmk_sadinfo { u32 sadhcnt; /* current hash bkts */ u32 sadhmcnt; /* max allowed hash bkts */ u32 sadcnt; /* current running count */ }; struct xfrmk_spdinfo { u32 incnt; u32 outcnt; u32 fwdcnt; u32 inscnt; u32 outscnt; u32 fwdscnt; u32 spdhcnt; u32 spdhmcnt; }; struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); int xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); void xfrm_local_error(struct sk_buff *skb, int mtu); int xfrm4_extract_header(struct sk_buff *skb); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); int xfrm4_rcv(struct sk_buff *skb); int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); void xfrm4_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_extract_header(struct sk_buff *skb); int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t); int xfrm6_transport_finish(struct sk_buff *skb, int async); int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t); int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); void xfrm6_local_error(struct sk_buff *skb, u32 mtu); int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family); __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); #ifdef CONFIG_XFRM int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb); int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen); #else static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) { return -ENOPROTOOPT; } static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { /* should not happen */ kfree_skb(skb); return 0; } #endif struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); struct xfrm_policy *xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid); int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr); void xfrm_input_init(void); int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); void xfrm_probe_algs(void); int xfrm_count_pfkey_auth_supported(void); int xfrm_count_pfkey_enc_supported(void); struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx); struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id); struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe); struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe); static inline bool xfrm6_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b) { return ipv6_addr_equal((const struct in6_addr *)a, (const struct in6_addr *)b); } static inline bool xfrm_addr_equal(const xfrm_address_t *a, const xfrm_address_t *b, sa_family_t family) { switch (family) { default: case AF_INET: return ((__force u32)a->a4 ^ (__force u32)b->a4) == 0; case AF_INET6: return xfrm6_addr_equal(a, b); } } static inline int xfrm_policy_id2dir(u32 index) { return index & 7; } #ifdef CONFIG_XFRM static inline int xfrm_aevent_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS); rcu_read_unlock(); return ret; } static inline int xfrm_acquire_is_on(struct net *net) { struct sock *nlsk; int ret = 0; rcu_read_lock(); nlsk = rcu_dereference(net->xfrm.nlsk); if (nlsk) ret = netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE); rcu_read_unlock(); return ret; } #endif static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } #ifdef CONFIG_XFRM_MIGRATE static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { x->replay_esn = kmemdup(orig->replay_esn, xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; x->preplay_esn = kmemdup(orig->preplay_esn, xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); if (!x->preplay_esn) return -ENOMEM; return 0; } static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig) { return kmemdup(orig, aead_len(orig), GFP_KERNEL); } static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig) { return kmemdup(orig, xfrm_alg_len(orig), GFP_KERNEL); } static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig) { return kmemdup(orig, xfrm_alg_auth_len(orig), GFP_KERNEL); } static inline void xfrm_states_put(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_put(*(states + i)); } static inline void xfrm_states_delete(struct xfrm_state **states, int n) { int i; for (i = 0; i < n; i++) xfrm_state_delete(*(states + i)); } #endif #ifdef CONFIG_XFRM static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { return skb->sp->xvec[skb->sp->len - 1]; } #endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { #ifdef CONFIG_XFRM struct sec_path *sp = skb->sp; if (!sp || !sp->olen || sp->len != sp->olen) return NULL; return &sp->ovec[sp->olen - 1]; #else return NULL; #endif } void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_resume(struct sk_buff *skb); void xfrm_dev_backlog(struct softnet_data *sd); struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; struct xfrm_dst *xdst; if (!x || !x->type_offload) return false; xdst = (struct xfrm_dst *) dst; if (!x->xso.offload_handle && !xdst->child->xfrm) return true; if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; return false; } static inline void xfrm_dev_state_delete(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; if (xso->dev) xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); } static inline void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; dev_put(dev); } } #else static inline void xfrm_dev_resume(struct sk_buff *skb) { } static inline void xfrm_dev_backlog(struct softnet_data *sd) { } static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { return skb; } static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) { return 0; } static inline void xfrm_dev_state_delete(struct xfrm_state *x) { } static inline void xfrm_dev_state_free(struct xfrm_state *x) { } static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; } static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) { } static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; } #endif static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) memcpy(m, nla_data(attrs[XFRMA_MARK]), sizeof(struct xfrm_mark)); else m->v = m->m = 0; return m->v & m->m; } static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) { int ret = 0; if (m->m | m->v) ret = nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m); return ret; } static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) { struct xfrm_mark *m = &x->props.smark; return (m->v & m->m) | (mark & ~m->m); } static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) { int ret = 0; if (if_id) ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); return ret; } static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { bool tunnel = false; switch(family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) tunnel = true; break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) tunnel = true; break; } if (tunnel && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)) return -EINVAL; return 0; } #endif /* _NET_XFRM_H */ |
8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> #include <linux/err.h> #include <linux/dma-debug.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> #include <linux/mem_encrypt.h> /** * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/DMA-attributes.txt. * * DMA_ATTR_WRITE_BARRIER: DMA to a memory region with this attribute * forces all pending DMA writes to complete. */ #define DMA_ATTR_WRITE_BARRIER (1UL << 0) /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NON_CONSISTENT: Lets the platform to choose to return either * consistent or non-consistent memory as it sees fit. */ #define DMA_ATTR_NON_CONSISTENT (1UL << 3) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain. */ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot * reference a dma_addr_t directly because there may be translation between * its physical address space and the bus address space. */ struct dma_map_ops { void* (*alloc)(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); int (*get_sgtable)(struct device *dev, struct sg_table *sgt, void *, dma_addr_t, size_t, unsigned long attrs); dma_addr_t (*map_page)(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); /* * map_sg returns 0 on error and a value > 0 on success. * It should never return a value < 0. */ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir); void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir); void (*cache_sync)(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); int (*dma_supported)(struct device *dev, u64 mask); #ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK u64 (*get_required_mask)(struct device *dev); #endif }; extern const struct dma_map_ops dma_direct_ops; extern const struct dma_map_ops dma_noncoherent_ops; extern const struct dma_map_ops dma_virt_ops; #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) #define DMA_MASK_NONE 0x0ULL static inline int valid_dma_direction(int dma_direction) { return ((dma_direction == DMA_BIDIRECTIONAL) || (dma_direction == DMA_TO_DEVICE) || (dma_direction == DMA_FROM_DEVICE)); } static inline int is_device_dma_capable(struct device *dev) { return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT /* * These three functions are only for dma allocator. * Don't use them in device drivers. */ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret); int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); #else #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) static inline void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) { return NULL; } static inline int dma_release_from_global_coherent(int order, void *vaddr) { return 0; } static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret) { return 0; } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA #include <asm/dma-mapping.h> static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { if (dev && dev->dma_ops) return dev->dma_ops; return get_arch_dma_ops(dev ? dev->bus : NULL); } static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *dma_ops) { dev->dma_ops = dma_ops; } #else /* * Define the dma api to allow compilation of dma dependent code. * Code that depends on the dma-mapping API needs to set 'depends on HAS_DMA' * in its Kconfig, unless it already depends on <something> || COMPILE_TEST, * where <something> guarantuees the availability of the dma-mapping API. */ static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { return NULL; } #endif static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, addr, true); return addr; } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, true); } /* * dma_maps_sg_attrs returns 0 on error and > 0 on success. * It should never return a value < 0. */ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); debug_dma_map_sg(dev, sg, nents, ents, dir); return ents; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); } static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); return addr; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, false); } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); /* Don't allow RAM to be mapped */ BUG_ON(pfn_valid(PHYS_PFN(phys_addr))); addr = phys_addr; if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); debug_dma_map_resource(dev, phys_addr, size, dir, addr); return addr; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr + offset, size, dir); debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr + offset, size, dir); debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->cache_sync) ops->cache_sync(dev, vaddr, size, dir); } extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size); void *dma_common_contiguous_remap(struct page *page, size_t size, unsigned long vm_flags, pgprot_t prot, const void *caller); void *dma_common_pages_remap(struct page **pages, size_t size, unsigned long vm_flags, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices * @vma: vm_area_struct describing requested user mapping * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs * @handle: device-view address returned from dma_alloc_attrs * @size: size of memory originally requested in dma_alloc_attrs * @attrs: attributes of mapping properties requested in dma_alloc_attrs * * Map a coherent DMA buffer previously allocated by dma_alloc_attrs * into user space. The coherent DMA buffer must not be freed by the * driver until the user space mapping has been released. */ static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); if (ops->mmap) return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); } #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size); static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); if (ops->get_sgtable) return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); } #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) #endif static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; BUG_ON(!ops); WARN_ON_ONCE(dev && !dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); if (!arch_dma_alloc_attrs(&dev)) return NULL; if (!ops->alloc) return NULL; cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); return cpu_addr; } static inline void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; /* * On non-coherent platforms which implement DMA-coherent buffers via * non-cacheable remaps, ops->free() may call vunmap(). Thus getting * this far in IRQ context is a) at risk of a BUG_ON() or trying to * sleep on some machines, and b) an indication that the driver is * probably misusing the coherent API anyway. */ WARN_ON(irqs_disabled()); if (!ops->free || !cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); ops->free(dev, size, cpu_addr, dma_handle, attrs); } static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { return dma_alloc_attrs(dev, size, dma_handle, flag, 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); debug_dma_mapping_error(dev, dma_addr); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); return 0; } static inline void dma_check_mask(struct device *dev, u64 mask) { if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); } static inline int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); if (!ops) return 0; if (!ops->dma_supported) return 1; return ops->dma_supported(dev, mask); } #ifndef HAVE_ARCH_DMA_SET_MASK static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } #endif static inline u64 dma_get_mask(struct device *dev) { if (dev && dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask); #else static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { if (!dma_supported(dev, mask)) return -EIO; dma_check_mask(dev, mask); dev->coherent_dma_mask = mask; return 0; } #endif /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. */ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } extern u64 dma_get_required_mask(struct device *dev); #ifndef arch_setup_dma_ops static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { } #endif #ifndef arch_teardown_dma_ops static inline void arch_teardown_dma_ops(struct device *dev) { } #endif static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline int dma_set_max_seg_size(struct device *dev, unsigned int size) { if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; } return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return DMA_BIT_MASK(32); } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; } return -EIO; } #ifndef dma_max_pfn static inline unsigned long dma_max_pfn(struct device *dev) { return (*dev->dma_mask >> PAGE_SHIFT) + dev->dma_pfn_offset; } #endif static inline void *dma_zalloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret = dma_alloc_coherent(dev, size, dma_handle, flag | __GFP_ZERO); return ret; } static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } /* flags for the coherent memory api */ #define DMA_MEMORY_EXCLUSIVE 0x01 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); void dma_release_declared_memory(struct device *dev); void *dma_mark_declared_memory_occupied(struct device *dev, dma_addr_t device_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { return -ENOSYS; } static inline void dma_release_declared_memory(struct device *dev) { } static inline void * dma_mark_declared_memory_occupied(struct device *dev, dma_addr_t device_addr, size_t size) { return ERR_PTR(-EBUSY); } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA int dma_configure(struct device *dev); void dma_deconfigure(struct device *dev); #else static inline int dma_configure(struct device *dev) { return 0; } static inline void dma_deconfigure(struct device *dev) {} #endif /* * Managed DMA API */ #ifdef CONFIG_HAS_DMA extern void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); #else /* !CONFIG_HAS_DMA */ static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } #endif /* !CONFIG_HAS_DMA */ extern void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); extern void dmam_release_declared_memory(struct device *dev); #else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, gfp_t gfp) { return 0; } static inline void dmam_release_declared_memory(struct device *dev) { } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_addr, gfp, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_alloc_writecombine #define dma_alloc_writecombine dma_alloc_wc #endif static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_free_writecombine #define dma_free_writecombine dma_free_wc #endif static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_mmap_writecombine #define dma_mmap_writecombine dma_mmap_wc #endif #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) (0) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) #define dma_unmap_len(PTR, LEN_NAME) (0) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif #endif |
20 26 20 17 17 17 10 8 8 1 1 25 2 24 4 21 21 1 11 11 4 7 7 11 11 10 1 11 11 3 1 1 19 1 18 18 13 18 18 15 15 15 1 4 17 18 4 4 14 4 19 3 2 1 1 3 18 18 1 5 5 14 2 14 2 1 9 1 10 11 10 9 21 21 1 5 2 3 6 1 6 1 3 6 32 32 32 32 32 32 32 18 22 22 22 12 22 89 5 3 2 9 2 7 6 2 2 1 2 2 4 4 3 3 2 2 3 1 21 21 21 21 21 21 29 29 28 28 27 27 28 29 43 43 40 41 40 41 41 43 22 22 22 22 8 22 22 61 61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 | /* * videobuf2-v4l2.c - V4L2 driver helper framework * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * Marek Szyprowski <m.szyprowski@samsung.com> * * The vb2_thread implementation was based on code from videobuf-dvb.c: * (c) 2004 Gerd Knorr <kraxel@bytesex.org> [SUSE Labs] * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation. */ #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <media/v4l2-dev.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include <media/v4l2-common.h> #include <media/videobuf2-v4l2.h> static int debug; module_param(debug, int, 0644); #define dprintk(level, fmt, arg...) \ do { \ if (debug >= level) \ pr_info("vb2-v4l2: %s: " fmt, __func__, ## arg); \ } while (0) /* Flags that are set by the vb2 core */ #define V4L2_BUFFER_MASK_FLAGS (V4L2_BUF_FLAG_MAPPED | V4L2_BUF_FLAG_QUEUED | \ V4L2_BUF_FLAG_DONE | V4L2_BUF_FLAG_ERROR | \ V4L2_BUF_FLAG_PREPARED | \ V4L2_BUF_FLAG_TIMESTAMP_MASK) /* Output buffer flags that should be passed on to the driver */ #define V4L2_BUFFER_OUT_FLAGS (V4L2_BUF_FLAG_PFRAME | V4L2_BUF_FLAG_BFRAME | \ V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_TIMECODE) /* * __verify_planes_array() - verify that the planes array passed in struct * v4l2_buffer from userspace can be safely used */ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer *b) { if (!V4L2_TYPE_IS_MULTIPLANAR(b->type)) return 0; /* Is memory for copying plane information present? */ if (b->m.planes == NULL) { dprintk(1, "multi-planar buffer passed but planes array not provided\n"); return -EINVAL; } if (b->length < vb->num_planes || b->length > VB2_MAX_PLANES) { dprintk(1, "incorrect planes array length, expected %d, got %d\n", vb->num_planes, b->length); return -EINVAL; } return 0; } static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) { return __verify_planes_array(vb, pb); } /* * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. */ static int __verify_length(struct vb2_buffer *vb, const struct v4l2_buffer *b) { unsigned int length; unsigned int bytesused; unsigned int plane; if (!V4L2_TYPE_IS_OUTPUT(b->type)) return 0; if (V4L2_TYPE_IS_MULTIPLANAR(b->type)) { for (plane = 0; plane < vb->num_planes; ++plane) { length = (b->memory == VB2_MEMORY_USERPTR || b->memory == VB2_MEMORY_DMABUF) ? b->m.planes[plane].length : vb->planes[plane].length; bytesused = b->m.planes[plane].bytesused ? b->m.planes[plane].bytesused : length; if (b->m.planes[plane].bytesused > length) return -EINVAL; if (b->m.planes[plane].data_offset > 0 && b->m.planes[plane].data_offset >= bytesused) return -EINVAL; } } else { length = (b->memory == VB2_MEMORY_USERPTR) ? b->length : vb->planes[0].length; if (b->bytesused > length) return -EINVAL; } return 0; } static void __copy_timestamp(struct vb2_buffer *vb, const void *pb) { const struct v4l2_buffer *b = pb; struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vb2_queue *q = vb->vb2_queue; if (q->is_output) { /* * For output buffers copy the timestamp if needed, * and the timecode field and flag if needed. */ if (q->copy_timestamp) vb->timestamp = timeval_to_ns(&b->timestamp); vbuf->flags |= b->flags & V4L2_BUF_FLAG_TIMECODE; if (b->flags & V4L2_BUF_FLAG_TIMECODE) vbuf->timecode = b->timecode; } }; static void vb2_warn_zero_bytesused(struct vb2_buffer *vb) { static bool check_once; if (check_once) return; check_once = true; pr_warn("use of bytesused == 0 is deprecated and will be removed in the future,\n"); if (vb->vb2_queue->allow_zero_bytesused) pr_warn("use VIDIOC_DECODER_CMD(V4L2_DEC_CMD_STOP) instead.\n"); else pr_warn("use the actual size instead.\n"); } static int vb2_queue_or_prepare_buf(struct vb2_queue *q, struct v4l2_buffer *b, const char *opname) { if (b->type != q->type) { dprintk(1, "%s: invalid buffer type\n", opname); return -EINVAL; } if (b->index >= q->num_buffers) { dprintk(1, "%s: buffer index out of range\n", opname); return -EINVAL; } if (q->bufs[b->index] == NULL) { /* Should never happen */ dprintk(1, "%s: buffer is NULL\n", opname); return -EINVAL; } if (b->memory != q->memory) { dprintk(1, "%s: invalid memory type\n", opname); return -EINVAL; } return __verify_planes_array(q->bufs[b->index], b); } /* * __fill_v4l2_buffer() - fill in a struct v4l2_buffer with information to be * returned to userspace */ static void __fill_v4l2_buffer(struct vb2_buffer *vb, void *pb) { struct v4l2_buffer *b = pb; struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vb2_queue *q = vb->vb2_queue; unsigned int plane; /* Copy back data such as timestamp, flags, etc. */ b->index = vb->index; b->type = vb->type; b->memory = vb->memory; b->bytesused = 0; b->flags = vbuf->flags; b->field = vbuf->field; b->timestamp = ns_to_timeval(vb->timestamp); b->timecode = vbuf->timecode; b->sequence = vbuf->sequence; b->reserved2 = 0; b->reserved = 0; if (q->is_multiplanar) { /* * Fill in plane-related data if userspace provided an array * for it. The caller has already verified memory and size. */ b->length = vb->num_planes; for (plane = 0; plane < vb->num_planes; ++plane) { struct v4l2_plane *pdst = &b->m.planes[plane]; struct vb2_plane *psrc = &vb->planes[plane]; pdst->bytesused = psrc->bytesused; pdst->length = psrc->length; if (q->memory == VB2_MEMORY_MMAP) pdst->m.mem_offset = psrc->m.offset; else if (q->memory == VB2_MEMORY_USERPTR) pdst->m.userptr = psrc->m.userptr; else if (q->memory == VB2_MEMORY_DMABUF) pdst->m.fd = psrc->m.fd; pdst->data_offset = psrc->data_offset; memset(pdst->reserved, 0, sizeof(pdst->reserved)); } } else { /* * We use length and offset in v4l2_planes array even for * single-planar buffers, but userspace does not. */ b->length = vb->planes[0].length; b->bytesused = vb->planes[0].bytesused; if (q->memory == VB2_MEMORY_MMAP) b->m.offset = vb->planes[0].m.offset; else if (q->memory == VB2_MEMORY_USERPTR) b->m.userptr = vb->planes[0].m.userptr; else if (q->memory == VB2_MEMORY_DMABUF) b->m.fd = vb->planes[0].m.fd; } /* * Clear any buffer state related flags. */ b->flags &= ~V4L2_BUFFER_MASK_FLAGS; b->flags |= q->timestamp_flags & V4L2_BUF_FLAG_TIMESTAMP_MASK; if (!q->copy_timestamp) { /* * For non-COPY timestamps, drop timestamp source bits * and obtain the timestamp source from the queue. */ b->flags &= ~V4L2_BUF_FLAG_TSTAMP_SRC_MASK; b->flags |= q->timestamp_flags & V4L2_BUF_FLAG_TSTAMP_SRC_MASK; } switch (vb->state) { case VB2_BUF_STATE_QUEUED: case VB2_BUF_STATE_ACTIVE: b->flags |= V4L2_BUF_FLAG_QUEUED; break; case VB2_BUF_STATE_ERROR: b->flags |= V4L2_BUF_FLAG_ERROR; /* fall through */ case VB2_BUF_STATE_DONE: b->flags |= V4L2_BUF_FLAG_DONE; break; case VB2_BUF_STATE_PREPARED: b->flags |= V4L2_BUF_FLAG_PREPARED; break; case VB2_BUF_STATE_PREPARING: case VB2_BUF_STATE_DEQUEUED: case VB2_BUF_STATE_REQUEUEING: /* nothing */ break; } if (vb2_buffer_in_use(q, vb)) b->flags |= V4L2_BUF_FLAG_MAPPED; if (!q->is_output && b->flags & V4L2_BUF_FLAG_DONE && b->flags & V4L2_BUF_FLAG_LAST) q->last_buffer_dequeued = true; } /* * __fill_vb2_buffer() - fill a vb2_buffer with information provided in a * v4l2_buffer by the userspace. It also verifies that struct * v4l2_buffer has a valid number of planes. */ static int __fill_vb2_buffer(struct vb2_buffer *vb, const void *pb, struct vb2_plane *planes) { struct vb2_queue *q = vb->vb2_queue; const struct v4l2_buffer *b = pb; struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); unsigned int plane; int ret; ret = __verify_length(vb, b); if (ret < 0) { dprintk(1, "plane parameters verification failed: %d\n", ret); return ret; } if (b->field == V4L2_FIELD_ALTERNATE && q->is_output) { /* * If the format's field is ALTERNATE, then the buffer's field * should be either TOP or BOTTOM, not ALTERNATE since that * makes no sense. The driver has to know whether the * buffer represents a top or a bottom field in order to * program any DMA correctly. Using ALTERNATE is wrong, since * that just says that it is either a top or a bottom field, * but not which of the two it is. */ dprintk(1, "the field is incorrectly set to ALTERNATE for an output buffer\n"); return -EINVAL; } vb->timestamp = 0; vbuf->sequence = 0; if (V4L2_TYPE_IS_MULTIPLANAR(b->type)) { if (b->memory == VB2_MEMORY_USERPTR) { for (plane = 0; plane < vb->num_planes; ++plane) { planes[plane].m.userptr = b->m.planes[plane].m.userptr; planes[plane].length = b->m.planes[plane].length; } } if (b->memory == VB2_MEMORY_DMABUF) { for (plane = 0; plane < vb->num_planes; ++plane) { planes[plane].m.fd = b->m.planes[plane].m.fd; planes[plane].length = b->m.planes[plane].length; } } /* Fill in driver-provided information for OUTPUT types */ if (V4L2_TYPE_IS_OUTPUT(b->type)) { /* * Will have to go up to b->length when API starts * accepting variable number of planes. * * If bytesused == 0 for the output buffer, then fall * back to the full buffer size. In that case * userspace clearly never bothered to set it and * it's a safe assumption that they really meant to * use the full plane sizes. * * Some drivers, e.g. old codec drivers, use bytesused == 0 * as a way to indicate that streaming is finished. * In that case, the driver should use the * allow_zero_bytesused flag to keep old userspace * applications working. */ for (plane = 0; plane < vb->num_planes; ++plane) { struct vb2_plane *pdst = &planes[plane]; struct v4l2_plane *psrc = &b->m.planes[plane]; if (psrc->bytesused == 0) vb2_warn_zero_bytesused(vb); if (vb->vb2_queue->allow_zero_bytesused) pdst->bytesused = psrc->bytesused; else pdst->bytesused = psrc->bytesused ? psrc->bytesused : pdst->length; pdst->data_offset = psrc->data_offset; } } } else { /* * Single-planar buffers do not use planes array, * so fill in relevant v4l2_buffer struct fields instead. * In videobuf we use our internal V4l2_planes struct for * single-planar buffers as well, for simplicity. * * If bytesused == 0 for the output buffer, then fall back * to the full buffer size as that's a sensible default. * * Some drivers, e.g. old codec drivers, use bytesused == 0 as * a way to indicate that streaming is finished. In that case, * the driver should use the allow_zero_bytesused flag to keep * old userspace applications working. */ if (b->memory == VB2_MEMORY_USERPTR) { planes[0].m.userptr = b->m.userptr; planes[0].length = b->length; } if (b->memory == VB2_MEMORY_DMABUF) { planes[0].m.fd = b->m.fd; planes[0].length = b->length; } if (V4L2_TYPE_IS_OUTPUT(b->type)) { if (b->bytesused == 0) vb2_warn_zero_bytesused(vb); if (vb->vb2_queue->allow_zero_bytesused) planes[0].bytesused = b->bytesused; else planes[0].bytesused = b->bytesused ? b->bytesused : planes[0].length; } else planes[0].bytesused = 0; } /* Zero flags that the vb2 core handles */ vbuf->flags = b->flags & ~V4L2_BUFFER_MASK_FLAGS; if (!vb->vb2_queue->copy_timestamp || !V4L2_TYPE_IS_OUTPUT(b->type)) { /* * Non-COPY timestamps and non-OUTPUT queues will get * their timestamp and timestamp source flags from the * queue. */ vbuf->flags &= ~V4L2_BUF_FLAG_TSTAMP_SRC_MASK; } if (V4L2_TYPE_IS_OUTPUT(b->type)) { /* * For output buffers mask out the timecode flag: * this will be handled later in vb2_qbuf(). * The 'field' is valid metadata for this output buffer * and so that needs to be copied here. */ vbuf->flags &= ~V4L2_BUF_FLAG_TIMECODE; vbuf->field = b->field; } else { /* Zero any output buffer flags as this is a capture buffer */ vbuf->flags &= ~V4L2_BUFFER_OUT_FLAGS; /* Zero last flag, this is a signal from driver to userspace */ vbuf->flags &= ~V4L2_BUF_FLAG_LAST; } return 0; } static const struct vb2_buf_ops v4l2_buf_ops = { .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .copy_timestamp = __copy_timestamp, }; /* * vb2_querybuf() - query video buffer information * @q: videobuf queue * @b: buffer struct passed from userspace to vidioc_querybuf handler * in driver * * Should be called from vidioc_querybuf ioctl handler in driver. * This function will verify the passed v4l2_buffer structure and fill the * relevant information for the userspace. * * The return values from this function are intended to be directly returned * from vidioc_querybuf handler in driver. */ int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b) { struct vb2_buffer *vb; int ret; if (b->type != q->type) { dprintk(1, "wrong buffer type\n"); return -EINVAL; } if (b->index >= q->num_buffers) { dprintk(1, "buffer index out of range\n"); return -EINVAL; } vb = q->bufs[b->index]; ret = __verify_planes_array(vb, b); if (!ret) vb2_core_querybuf(q, b->index, b); return ret; } EXPORT_SYMBOL(vb2_querybuf); int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) { int ret = vb2_verify_memory_type(q, req->memory, req->type); return ret ? ret : vb2_core_reqbufs(q, req->memory, &req->count); } EXPORT_SYMBOL_GPL(vb2_reqbufs); int vb2_prepare_buf(struct vb2_queue *q, struct v4l2_buffer *b) { int ret; if (vb2_fileio_is_active(q)) { dprintk(1, "file io in progress\n"); return -EBUSY; } ret = vb2_queue_or_prepare_buf(q, b, "prepare_buf"); return ret ? ret : vb2_core_prepare_buf(q, b->index, b); } EXPORT_SYMBOL_GPL(vb2_prepare_buf); int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) { unsigned requested_planes = 1; unsigned requested_sizes[VIDEO_MAX_PLANES]; struct v4l2_format *f = &create->format; int ret = vb2_verify_memory_type(q, create->memory, f->type); unsigned i; create->index = q->num_buffers; if (create->count == 0) return ret != -EBUSY ? ret : 0; switch (f->type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE: case V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE: requested_planes = f->fmt.pix_mp.num_planes; if (requested_planes == 0 || requested_planes > VIDEO_MAX_PLANES) return -EINVAL; for (i = 0; i < requested_planes; i++) requested_sizes[i] = f->fmt.pix_mp.plane_fmt[i].sizeimage; break; case V4L2_BUF_TYPE_VIDEO_CAPTURE: case V4L2_BUF_TYPE_VIDEO_OUTPUT: requested_sizes[0] = f->fmt.pix.sizeimage; break; case V4L2_BUF_TYPE_VBI_CAPTURE: case V4L2_BUF_TYPE_VBI_OUTPUT: requested_sizes[0] = f->fmt.vbi.samples_per_line * (f->fmt.vbi.count[0] + f->fmt.vbi.count[1]); break; case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE: case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT: requested_sizes[0] = f->fmt.sliced.io_size; break; case V4L2_BUF_TYPE_SDR_CAPTURE: case V4L2_BUF_TYPE_SDR_OUTPUT: requested_sizes[0] = f->fmt.sdr.buffersize; break; case V4L2_BUF_TYPE_META_CAPTURE: requested_sizes[0] = f->fmt.meta.buffersize; break; default: return -EINVAL; } for (i = 0; i < requested_planes; i++) if (requested_sizes[i] == 0) return -EINVAL; return ret ? ret : vb2_core_create_bufs(q, create->memory, &create->count, requested_planes, requested_sizes); } EXPORT_SYMBOL_GPL(vb2_create_bufs); int vb2_qbuf(struct vb2_queue *q, struct v4l2_buffer *b) { int ret; if (vb2_fileio_is_active(q)) { dprintk(1, "file io in progress\n"); return -EBUSY; } ret = vb2_queue_or_prepare_buf(q, b, "qbuf"); return ret ? ret : vb2_core_qbuf(q, b->index, b); } EXPORT_SYMBOL_GPL(vb2_qbuf); int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking) { int ret; if (vb2_fileio_is_active(q)) { dprintk(1, "file io in progress\n"); return -EBUSY; } if (b->type != q->type) { dprintk(1, "invalid buffer type\n"); return -EINVAL; } ret = vb2_core_dqbuf(q, NULL, b, nonblocking); /* * After calling the VIDIOC_DQBUF V4L2_BUF_FLAG_DONE must be * cleared. */ b->flags &= ~V4L2_BUF_FLAG_DONE; return ret; } EXPORT_SYMBOL_GPL(vb2_dqbuf); int vb2_streamon(struct vb2_queue *q, enum v4l2_buf_type type) { if (vb2_fileio_is_active(q)) { dprintk(1, "file io in progress\n"); return -EBUSY; } return vb2_core_streamon(q, type); } EXPORT_SYMBOL_GPL(vb2_streamon); int vb2_streamoff(struct vb2_queue *q, enum v4l2_buf_type type) { if (vb2_fileio_is_active(q)) { dprintk(1, "file io in progress\n"); return -EBUSY; } return vb2_core_streamoff(q, type); } EXPORT_SYMBOL_GPL(vb2_streamoff); int vb2_expbuf(struct vb2_queue *q, struct v4l2_exportbuffer *eb) { return vb2_core_expbuf(q, &eb->fd, eb->type, eb->index, eb->plane, eb->flags); } EXPORT_SYMBOL_GPL(vb2_expbuf); int vb2_queue_init(struct vb2_queue *q) { /* * Sanity check */ if (WARN_ON(!q) || WARN_ON(q->timestamp_flags & ~(V4L2_BUF_FLAG_TIMESTAMP_MASK | V4L2_BUF_FLAG_TSTAMP_SRC_MASK))) return -EINVAL; /* Warn that the driver should choose an appropriate timestamp type */ WARN_ON((q->timestamp_flags & V4L2_BUF_FLAG_TIMESTAMP_MASK) == V4L2_BUF_FLAG_TIMESTAMP_UNKNOWN); /* Warn that vb2_memory should match with v4l2_memory */ if (WARN_ON(VB2_MEMORY_MMAP != (int)V4L2_MEMORY_MMAP) || WARN_ON(VB2_MEMORY_USERPTR != (int)V4L2_MEMORY_USERPTR) || WARN_ON(VB2_MEMORY_DMABUF != (int)V4L2_MEMORY_DMABUF)) return -EINVAL; if (q->buf_struct_size == 0) q->buf_struct_size = sizeof(struct vb2_v4l2_buffer); q->buf_ops = &v4l2_buf_ops; q->is_multiplanar = V4L2_TYPE_IS_MULTIPLANAR(q->type); q->is_output = V4L2_TYPE_IS_OUTPUT(q->type); q->copy_timestamp = (q->timestamp_flags & V4L2_BUF_FLAG_TIMESTAMP_MASK) == V4L2_BUF_FLAG_TIMESTAMP_COPY; /* * For compatibility with vb1: if QBUF hasn't been called yet, then * return EPOLLERR as well. This only affects capture queues, output * queues will always initialize waiting_for_buffers to false. */ q->quirk_poll_must_check_waiting_for_buffers = true; return vb2_core_queue_init(q); } EXPORT_SYMBOL_GPL(vb2_queue_init); void vb2_queue_release(struct vb2_queue *q) { vb2_core_queue_release(q); } EXPORT_SYMBOL_GPL(vb2_queue_release); __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait) { struct video_device *vfd = video_devdata(file); __poll_t req_events = poll_requested_events(wait); __poll_t res = 0; if (test_bit(V4L2_FL_USES_V4L2_FH, &vfd->flags)) { struct v4l2_fh *fh = file->private_data; if (v4l2_event_pending(fh)) res = EPOLLPRI; else if (req_events & EPOLLPRI) poll_wait(file, &fh->wait, wait); } return res | vb2_core_poll(q, file, wait); } EXPORT_SYMBOL_GPL(vb2_poll); /* * The following functions are not part of the vb2 core API, but are helper * functions that plug into struct v4l2_ioctl_ops, struct v4l2_file_operations * and struct vb2_ops. * They contain boilerplate code that most if not all drivers have to do * and so they simplify the driver code. */ /* The queue is busy if there is a owner and you are not that owner. */ static inline bool vb2_queue_is_busy(struct video_device *vdev, struct file *file) { return vdev->queue->owner && vdev->queue->owner != file->private_data; } /* vb2 ioctl helpers */ int vb2_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *p) { struct video_device *vdev = video_devdata(file); int res = vb2_verify_memory_type(vdev->queue, p->memory, p->type); if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; res = vb2_core_reqbufs(vdev->queue, p->memory, &p->count); /* If count == 0, then the owner has released all buffers and he is no longer owner of the queue. Otherwise we have a new owner. */ if (res == 0) vdev->queue->owner = p->count ? file->private_data : NULL; return res; } EXPORT_SYMBOL_GPL(vb2_ioctl_reqbufs); int vb2_ioctl_create_bufs(struct file *file, void *priv, struct v4l2_create_buffers *p) { struct video_device *vdev = video_devdata(file); int res = vb2_verify_memory_type(vdev->queue, p->memory, p->format.type); p->index = vdev->queue->num_buffers; /* * If count == 0, then just check if memory and type are valid. * Any -EBUSY result from vb2_verify_memory_type can be mapped to 0. */ if (p->count == 0) return res != -EBUSY ? res : 0; if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; res = vb2_create_bufs(vdev->queue, p); if (res == 0) vdev->queue->owner = file->private_data; return res; } EXPORT_SYMBOL_GPL(vb2_ioctl_create_bufs); int vb2_ioctl_prepare_buf(struct file *file, void *priv, struct v4l2_buffer *p) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_prepare_buf(vdev->queue, p); } EXPORT_SYMBOL_GPL(vb2_ioctl_prepare_buf); int vb2_ioctl_querybuf(struct file *file, void *priv, struct v4l2_buffer *p) { struct video_device *vdev = video_devdata(file); /* No need to call vb2_queue_is_busy(), anyone can query buffers. */ return vb2_querybuf(vdev->queue, p); } EXPORT_SYMBOL_GPL(vb2_ioctl_querybuf); int vb2_ioctl_qbuf(struct file *file, void *priv, struct v4l2_buffer *p) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_qbuf(vdev->queue, p); } EXPORT_SYMBOL_GPL(vb2_ioctl_qbuf); int vb2_ioctl_dqbuf(struct file *file, void *priv, struct v4l2_buffer *p) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_dqbuf(vdev->queue, p, file->f_flags & O_NONBLOCK); } EXPORT_SYMBOL_GPL(vb2_ioctl_dqbuf); int vb2_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type i) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_streamon(vdev->queue, i); } EXPORT_SYMBOL_GPL(vb2_ioctl_streamon); int vb2_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type i) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_streamoff(vdev->queue, i); } EXPORT_SYMBOL_GPL(vb2_ioctl_streamoff); int vb2_ioctl_expbuf(struct file *file, void *priv, struct v4l2_exportbuffer *p) { struct video_device *vdev = video_devdata(file); if (vb2_queue_is_busy(vdev, file)) return -EBUSY; return vb2_expbuf(vdev->queue, p); } EXPORT_SYMBOL_GPL(vb2_ioctl_expbuf); /* v4l2_file_operations helpers */ int vb2_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct video_device *vdev = video_devdata(file); return vb2_mmap(vdev->queue, vma); } EXPORT_SYMBOL_GPL(vb2_fop_mmap); int _vb2_fop_release(struct file *file, struct mutex *lock) { struct video_device *vdev = video_devdata(file); if (lock) mutex_lock(lock); if (file->private_data == vdev->queue->owner) { vb2_queue_release(vdev->queue); vdev->queue->owner = NULL; } if (lock) mutex_unlock(lock); return v4l2_fh_release(file); } EXPORT_SYMBOL_GPL(_vb2_fop_release); int vb2_fop_release(struct file *file) { struct video_device *vdev = video_devdata(file); struct mutex *lock = vdev->queue->lock ? vdev->queue->lock : vdev->lock; return _vb2_fop_release(file, lock); } EXPORT_SYMBOL_GPL(vb2_fop_release); ssize_t vb2_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct video_device *vdev = video_devdata(file); struct mutex *lock = vdev->queue->lock ? vdev->queue->lock : vdev->lock; int err = -EBUSY; if (!(vdev->queue->io_modes & VB2_WRITE)) return -EINVAL; if (lock && mutex_lock_interruptible(lock)) return -ERESTARTSYS; if (vb2_queue_is_busy(vdev, file)) goto exit; err = vb2_write(vdev->queue, buf, count, ppos, file->f_flags & O_NONBLOCK); if (vdev->queue->fileio) vdev->queue->owner = file->private_data; exit: if (lock) mutex_unlock(lock); return err; } EXPORT_SYMBOL_GPL(vb2_fop_write); ssize_t vb2_fop_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct video_device *vdev = video_devdata(file); struct mutex *lock = vdev->queue->lock ? vdev->queue->lock : vdev->lock; int err = -EBUSY; if (!(vdev->queue->io_modes & VB2_READ)) return -EINVAL; if (lock && mutex_lock_interruptible(lock)) return -ERESTARTSYS; if (vb2_queue_is_busy(vdev, file)) goto exit; err = vb2_read(vdev->queue, buf, count, ppos, file->f_flags & O_NONBLOCK); if (vdev->queue->fileio) vdev->queue->owner = file->private_data; exit: if (lock) mutex_unlock(lock); return err; } EXPORT_SYMBOL_GPL(vb2_fop_read); __poll_t vb2_fop_poll(struct file *file, poll_table *wait) { struct video_device *vdev = video_devdata(file); struct vb2_queue *q = vdev->queue; struct mutex *lock = q->lock ? q->lock : vdev->lock; __poll_t res; void *fileio; /* * If this helper doesn't know how to lock, then you shouldn't be using * it but you should write your own. */ WARN_ON(!lock); if (lock && mutex_lock_interruptible(lock)) return EPOLLERR; fileio = q->fileio; res = vb2_poll(vdev->queue, file, wait); /* If fileio was started, then we have a new queue owner. */ if (!fileio && q->fileio) q->owner = file->private_data; if (lock) mutex_unlock(lock); return res; } EXPORT_SYMBOL_GPL(vb2_fop_poll); #ifndef CONFIG_MMU unsigned long vb2_fop_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct video_device *vdev = video_devdata(file); return vb2_get_unmapped_area(vdev->queue, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(vb2_fop_get_unmapped_area); #endif /* vb2_ops helpers. Only use if vq->lock is non-NULL. */ void vb2_ops_wait_prepare(struct vb2_queue *vq) { mutex_unlock(vq->lock); } EXPORT_SYMBOL_GPL(vb2_ops_wait_prepare); void vb2_ops_wait_finish(struct vb2_queue *vq) { mutex_lock(vq->lock); } EXPORT_SYMBOL_GPL(vb2_ops_wait_finish); MODULE_DESCRIPTION("Driver helper framework for Video for Linux 2"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>, Marek Szyprowski"); MODULE_LICENSE("GPL"); |
15 85 69 85 21 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* * Copyright (C) International Business Machines Corp., 2000-2002 * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_METAPAGE #define _H_JFS_METAPAGE #include <linux/pagemap.h> struct metapage { /* Common logsyncblk prefix (see jfs_logmgr.h) */ u16 xflag; u16 unused; lid_t lid; int lsn; struct list_head synclist; /* End of logsyncblk prefix */ unsigned long flag; /* See Below */ unsigned long count; /* Reference count */ void *data; /* Data pointer */ sector_t index; /* block address of page */ wait_queue_head_t wait; /* implementation */ struct page *page; struct super_block *sb; unsigned int logical_size; /* Journal management */ int clsn; int nohomeok; struct jfs_log *log; }; /* metapage flag */ #define META_locked 0 #define META_dirty 2 #define META_sync 3 #define META_discard 4 #define META_forcewrite 5 #define META_io 6 #define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) /* function prototypes */ extern int metapage_init(void); extern void metapage_exit(void); extern struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, unsigned int size, int absolute, unsigned long new); #define read_metapage(inode, lblock, size, absolute)\ __get_metapage(inode, lblock, size, absolute, false) #define get_metapage(inode, lblock, size, absolute)\ __get_metapage(inode, lblock, size, absolute, true) extern void release_metapage(struct metapage *); extern void grab_metapage(struct metapage *); extern void force_metapage(struct metapage *); /* * hold_metapage and put_metapage are used in conjunction. The page lock * is not dropped between the two, so no other threads can get or release * the metapage */ extern void hold_metapage(struct metapage *); extern void put_metapage(struct metapage *); static inline void write_metapage(struct metapage *mp) { set_bit(META_dirty, &mp->flag); release_metapage(mp); } static inline void flush_metapage(struct metapage *mp) { set_bit(META_sync, &mp->flag); write_metapage(mp); } static inline void discard_metapage(struct metapage *mp) { clear_bit(META_dirty, &mp->flag); set_bit(META_discard, &mp->flag); release_metapage(mp); } static inline void metapage_nohomeok(struct metapage *mp) { struct page *page = mp->page; lock_page(page); if (!mp->nohomeok++) { mark_metapage_dirty(mp); get_page(page); wait_on_page_writeback(page); } unlock_page(page); } /* * This serializes access to mp->lsn when metapages are added to logsynclist * without setting nohomeok. i.e. updating imap & dmap */ static inline void metapage_wait_for_io(struct metapage *mp) { if (test_bit(META_io, &mp->flag)) wait_on_page_writeback(mp->page); } /* * This is called when already holding the metapage */ static inline void _metapage_homeok(struct metapage *mp) { if (!--mp->nohomeok) put_page(mp->page); } static inline void metapage_homeok(struct metapage *mp) { hold_metapage(mp); _metapage_homeok(mp); put_metapage(mp); } extern const struct address_space_operations jfs_metapage_aops; /* * This routines invalidate all pages for an extent. */ extern void __invalidate_metapages(struct inode *, s64, int); #define invalidate_pxd_metapages(ip, pxd) \ __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd))) #define invalidate_dxd_metapages(ip, dxd) \ __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd))) #define invalidate_xad_metapages(ip, xad) \ __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad))) #endif /* _H_JFS_METAPAGE */ |
5 14 3 4 3 4 2 14 14 30 30 24 30 4 4 13 1 1 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 | /* * CIPSO - Commercial IP Security Option * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * http://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> * */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; int cipso_v4_cache_bucketsize = 10; #define CIPSO_V4_CACHE_BUCKETBITS 7 #define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) #define CIPSO_V4_CACHE_REORDERLIMIT 10 struct cipso_v4_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt = 0; int cipso_v4_rbm_strictvalid = 1; /* * Protocol Constants */ /* Maximum size of the CIPSO IP option, derived from the fact that the maximum * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ #define CIPSO_V4_OPT_LEN_MAX 40 /* Length of the base CIPSO option, this includes the option type (1 byte), the * option length (1 byte), and the DOI (4 bytes). */ #define CIPSO_V4_HDR_LEN 6 /* Base length of the restrictive category bitmap tag (tag #1). */ #define CIPSO_V4_TAG_RBM_BLEN 4 /* Base length of the enumerated category tag (tag #2). */ #define CIPSO_V4_TAG_ENUM_BLEN 4 /* Base length of the ranged categories bitmap tag (tag #5). */ #define CIPSO_V4_TAG_RNG_BLEN 4 /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 /* Base length of the local tag (non-standard tag). * Tag definition (may change between kernel versions) * * 0 8 16 24 32 * +----------+----------+----------+----------+ * | 10000000 | 00000110 | 32-bit secid value | * +----------+----------+----------+----------+ * | in (host byte order)| * +----------+----------+ * */ #define CIPSO_V4_TAG_LOC_BLEN 6 /* * Helper Functions */ /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /* * Label Mapping Cache Functions */ /** * cipso_v4_cache_init - Initialize the CIPSO cache * * Description: * Initializes the CIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init cipso_v4_cache_init(void) { u32 iter; cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(struct cipso_v4_map_cache_bkt), GFP_KERNEL); if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_init(&cipso_v4_cache[iter].lock); cipso_v4_cache[iter].size = 0; INIT_LIST_HEAD(&cipso_v4_cache[iter].list); } return 0; } /** * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: * Invalidates and frees any entries in the CIPSO cache. Returns zero on * success and negative values on failure. * */ void cipso_v4_cache_invalidate(void) { struct cipso_v4_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_bh(&cipso_v4_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &cipso_v4_cache[iter].list, list) { list_del(&entry->list); cipso_v4_cache_entry_free(entry); } cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } } /** * cipso_v4_cache_check - Check the CIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int cipso_v4_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct cipso_v4_map_cache_entry *entry; struct cipso_v4_map_cache_entry *prev_entry = NULL; u32 hash; if (!READ_ONCE(cipso_v4_cache_enabled)) return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CIPSO_V4_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return -ENOENT; } /** * cipso_v4_cache_add - Add an entry to the CIPSO cache * @skb: the packet * @secattr: the packet's security attributes * * Description: * Add a new entry into the CIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. * */ int cipso_v4_cache_add(const unsigned char *cipso_ptr, const struct netlbl_lsm_secattr *secattr) { int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); int ret_val = -EPERM; u32 bkt; struct cipso_v4_map_cache_entry *entry = NULL; struct cipso_v4_map_cache_entry *old_entry = NULL; u32 cipso_ptr_len; if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) return 0; cipso_ptr_len = cipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < bkt_size) { list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache[bkt].size += 1; } else { old_entry = list_entry(cipso_v4_cache[bkt].list.prev, struct cipso_v4_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache_entry_free(old_entry); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; cache_add_failure: if (entry) cipso_v4_cache_entry_free(entry); return ret_val; } /* * DOI List Functions */ /** * cipso_v4_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) { struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see cipso_ipv4.h for details). Returns * zero on success and non-zero on failure. * */ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 iter; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: break; case CIPSO_V4_TAG_RANGE: case CIPSO_V4_TAG_ENUM: if (doi_def->type != CIPSO_V4_MAP_PASS) goto doi_add_return; break; case CIPSO_V4_TAG_LOCAL: if (doi_def->type != CIPSO_V4_MAP_LOCAL) goto doi_add_return; break; case CIPSO_V4_TAG_INVALID: if (iter == 0) goto doi_add_return; break; default: goto doi_add_return; } } refcount_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); spin_unlock(&cipso_v4_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: type_str = "trans"; break; case CIPSO_V4_MAP_PASS: type_str = "pass"; break; case CIPSO_V4_MAP_LOCAL: type_str = "local"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " cipso_doi=%u cipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { if (!doi_def) return; switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: kfree(doi_def->map.std->lvl.cipso); kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); kfree(doi_def->map.std); break; } kfree(doi_def); } /** * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) { struct cipso_v4_doi *doi_def; doi_def = container_of(entry, struct cipso_v4_doi, rcu); cipso_v4_doi_free(doi_def); } /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_secid: the LSM secid to use in the audit message * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * rcu_read_lock() is held while accessing the returned definition and the DOI * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { struct cipso_v4_doi *doi_def; rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * cipso_v4_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). * */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** * cipso_v4_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int cipso_v4_doi_walk(u32 *skip_cnt, int (*callback) (struct cipso_v4_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct cipso_v4_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /* * Label Mapping Functions */ /** * cipso_v4_map_lvl_valid - Checks to see if the given level is understood * @doi_def: the DOI definition * @level: the level to check * * Description: * Checks the given level against the given DOI definition and returns a * negative value if the level does not have a valid mapping and a zero value * if the level is defined by the DOI. * */ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: if ((level < doi_def->map.std->lvl.cipso_size) && (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } return -EFAULT; } /** * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network * @doi_def: the DOI definition * @host_lvl: the host MLS level * @net_lvl: the network/CIPSO MLS level * * Description: * Perform a label mapping to translate a local MLS level to the correct * CIPSO level using the given DOI definition. Returns zero on success, * negative values otherwise. * */ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, u32 host_lvl, u32 *net_lvl) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host * @doi_def: the DOI definition * @net_lvl: the network/CIPSO MLS level * @host_lvl: the host MLS level * * Description: * Perform a label mapping to translate a CIPSO level to the correct local MLS * level using the given DOI definition. Returns zero on success, negative * values otherwise. * */ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, u32 net_lvl, u32 *host_lvl) { struct cipso_v4_std_map_tbl *map_tbl; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid * @doi_def: the DOI definition * @bitmap: category bitmap * @bitmap_len: bitmap length in bytes * * Description: * Checks the given category bitmap against the given DOI definition and * returns a negative value if any of the categories in the bitmap do not have * a valid mapping and a zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, const unsigned char *bitmap, u32 bitmap_len) { int cat = -1; u32 bitmap_len_bits = bitmap_len * 8; u32 cipso_cat_size; u32 *cipso_array; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { cat = netlbl_bitmap_walk(bitmap, bitmap_len_bits, cat + 1, 1); if (cat < 0) break; if (cat >= cipso_cat_size || cipso_array[cat] >= CIPSO_V4_INV_CAT) return -EFAULT; } if (cat == -1) return 0; break; } return -EFAULT; } /** * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int host_spot = -1; u32 net_spot = CIPSO_V4_INV_CAT; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; u32 host_cat_size = 0; u32 *host_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } for (;;) { host_spot = netlbl_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; if (net_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } if (net_spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; } if (++net_spot_max % 8) return net_spot_max / 8 + 1; return net_spot_max / 8; } /** * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int net_spot = -1; u32 host_spot = CIPSO_V4_INV_CAT; u32 net_clen_bits = net_cat_len * 8; u32 net_cat_size = 0; u32 *net_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } for (;;) { net_spot = netlbl_bitmap_walk(net_cat, net_clen_bits, net_spot + 1, 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; return 0; } switch (doi_def->type) { case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; if (host_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @enumcat: category list * @enumcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, const unsigned char *enumcat, u32 enumcat_len) { u16 cat; int cat_prev = -1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { cat = get_unaligned_be16(&enumcat[iter]); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; } return 0; } /** * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int cat = -1; u32 cat_iter = 0; for (;;) { cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) return -ENOSPC; *((__be16 *)&net_cat[cat_iter]) = htons(cat); cat_iter += 2; } return cat_iter; } /** * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, get_unaligned_be16(&net_cat[iter]), GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /** * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @rngcat: category list * @rngcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, const unsigned char *rngcat, u32 rngcat_len) { u16 cat_high; u16 cat_low; u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { cat_high = get_unaligned_be16(&rngcat[iter]); if ((iter + 4) <= rngcat_len) cat_low = get_unaligned_be16(&rngcat[iter + 2]); else cat_low = 0; if (cat_high > cat_prev) return -EFAULT; cat_prev = cat_low; } return 0; } /** * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int iter = -1; u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; /* make sure we don't overflow the 'array[]' variable */ if (net_cat_len > (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) return -ENOSPC; for (;;) { iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 0 : sizeof(u16)); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; } for (iter = 0; array_cnt > 0;) { *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); iter += 2; array_cnt--; if (array[array_cnt] != 0) { *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); iter += 2; } } return cat_size; } /** * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 net_iter; u16 cat_low; u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = get_unaligned_be16(&net_cat[net_iter]); if ((net_iter + 4) <= net_cat_len) cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); else cat_low = 0; ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /* * Protocol Handling Functions */ /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition * @len: the total tag length in bytes, not including this header * @buf: the CIPSO option buffer * * Description: * Write a CIPSO header into the beginning of @buffer. * */ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, unsigned char *buf, u32 len) { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; *(__be32 *)&buf[2] = htonl(doi_def->doi); } /** * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The * actual buffer length may be larger than the indicated size due to * translation between host and network category bitmaps. Returns the size of * the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rbm_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; /* This will send packets using the "optimized" format when * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && ret_val <= 10) tag_len = 14; else tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_enum_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO enumerated tag (tag type #2) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the ranged tag, tag type #5. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rng_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO ranged tag (tag type #5) and return the security attributes * in @secattr. Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the local tag. Returns the size of the tag * on success, negative values on failure. * */ static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { if (!(secattr->flags & NETLBL_SECATTR_SECID)) return -EPERM; buffer[0] = CIPSO_V4_TAG_LOCAL; buffer[1] = CIPSO_V4_TAG_LOC_BLEN; *(u32 *)&buffer[2] = secattr->attr.secid; return CIPSO_V4_TAG_LOC_BLEN; } /** * cipso_v4_parsetag_loc - Parse a CIPSO local tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO local tag and return the security attributes in @secattr. * Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { secattr->attr.secid = *(u32 *)&tag[2]; secattr->flags |= NETLBL_SECATTR_SECID; return 0; } /** * cipso_v4_optptr - Find the CIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); int optlen; int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { case IPOPT_END: return NULL; case IPOPT_NOOP: taglen = 1; break; default: taglen = optptr[1]; } if (!taglen || taglen > optlen) return NULL; if (optptr[0] == IPOPT_CIPSO) return optptr; optlen -= taglen; optptr += taglen; } return NULL; } /** * cipso_v4_validate - Validate a CIPSO option * @option: the start of the option, on error it is set to point to the error * * Description: * This routine is called to validate a CIPSO option, it checks all of the * fields to ensure that they are at least valid, see the draft snippet below * for details. If the option is valid then a zero value is returned and * the value of @option is unchanged. If the option is invalid then a * non-zero value is returned and @option is adjusted to point to the * offending portion of the option. From the IETF draft ... * * "If any field within the CIPSO options, such as the DOI identifier, is not * recognized the IP datagram is discarded and an ICMP 'parameter problem' * (type 12) is generated and returned. The ICMP code field is set to 'bad * parameter' (code 0) and the pointer is set to the start of the CIPSO field * that is unrecognized." * */ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; unsigned char opt_iter; unsigned char err_offset = 0; u8 opt_len; u8 tag_len; struct cipso_v4_doi *doi_def = NULL; u32 tag_iter; /* caller already checks for length values that are too large */ opt_len = opt[1]; if (opt_len < 8) { err_offset = 1; goto validate_return; } rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); if (!doi_def) { err_offset = 2; goto validate_return_locked; } opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || ++tag_iter == CIPSO_V4_TAG_MAXCNT) { err_offset = opt_iter; goto validate_return_locked; } if (opt_iter + 1 == opt_len) { err_offset = opt_iter; goto validate_return_locked; } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; goto validate_return_locked; } switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } /* We are already going to do all the verification * necessary at the socket layer so from our point of * view it is safe to turn these checks off (and less * work), however, the CIPSO draft says we should do * all the CIPSO validations here but it doesn't * really specify _exactly_ what we need to validate * ... so, just make it a sysctl tunable. */ if (READ_ONCE(cipso_v4_rbm_strictvalid)) { if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } } break; case CIPSO_V4_TAG_ENUM: if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_RANGE: if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. */ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } break; default: err_offset = opt_iter; goto validate_return_locked; } tag += tag_len; opt_iter += tag_len; } validate_return_locked: rcu_read_unlock(); validate_return: *option = opt + err_offset; return err_offset; } /** * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag * * Description: * Based on the error code given in @error, send an ICMP error message back to * the originating host. From the IETF draft ... * * "If the contents of the CIPSO [option] are valid but the security label is * outside of the configured host or port label range, the datagram is * discarded and an ICMP 'destination unreachable' (type 3) is generated and * returned. The code field of the ICMP is set to 'communication with * destination network administratively prohibited' (code 9) or to * 'communication with destination host administratively prohibited' * (code 10). The value of the code is dependent on whether the originator * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The * recipient of the ICMP message MUST be able to handle either value. The * same procedure is performed if a CIPSO [option] can not be added to an * IP packet because it is too large to fit in the IP options area." * * "If the error is triggered by receipt of an ICMP message, the message is * discarded and no response is permitted (consistent with general ICMP * processing rules)." * */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; /* * We might be called above the IP layer, * so we can not use icmp_send and IPCB here. */ memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); rcu_read_lock(); res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** * cipso_v4_genopt - Generate a CIPSO option * @buf: the option buffer * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CIPSO option using the DOI definition and security attributes * passed to the function. Returns the length of the option on success and * negative values on failure. * */ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; if (buf_len <= CIPSO_V4_HDR_LEN) return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ iter = 0; do { memset(buf, 0, buf_len); switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_gentag_rbm(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_gentag_enum(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_gentag_rng(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_gentag_loc(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; default: return -EPERM; } iter++; } while (ret_val < 0 && iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); return CIPSO_V4_HDR_LEN + ret_val; } /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; /* In the case of sock_create_lite(), the sock->sk field is not * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto socket_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); old = rcu_dereference_protected(sk_inet->inet_opt, lockdep_sock_is_held(sk)); if (sk_inet->is_icsk) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } rcu_assign_pointer(sk_inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); return 0; socket_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int cipso_v4_req_setattr(struct request_sock *req, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto req_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); return 0; req_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_delopt - Delete the CIPSO option from a set of IP options * @opt_ptr: IP option pointer * * Description: * Deletes the CIPSO IP option from a set of IP options and makes the necessary * adjustments to the IP option structure. Returns zero on success, negative * values on failure. * */ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; if (!opt || opt->opt.cipso == 0) return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; if (opt->opt.srr > opt->opt.cipso) opt->opt.srr -= cipso_len; if (opt->opt.rr > opt->opt.cipso) opt->opt.rr -= cipso_len; if (opt->opt.ts > opt->opt.cipso) opt->opt.ts -= cipso_len; if (opt->opt.router_alert > opt->opt.cipso) opt->opt.router_alert -= cipso_len; opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at * this point is walk the options one-by-one, skipping the * padding at the end to determine the actual option size and * from there we can determine the new total option length */ iter = 0; optlen_new = 0; while (iter < opt->opt.optlen) if (opt->opt.__data[iter] != IPOPT_NOP) { iter += opt->opt.__data[iter + 1]; optlen_new = iter; } else iter++; hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; hdr_delta = opt->opt.optlen; kfree_rcu(opt, rcu); } return hdr_delta; } /** * cipso_v4_sock_delattr - Delete the CIPSO option from a socket * @sk: the socket * * Description: * Removes the CIPSO option from a socket, if present. * */ void cipso_v4_sock_delattr(struct sock *sk) { struct inet_sock *sk_inet; int hdr_delta; sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (sk_inet->is_icsk && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } } /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket * @reg: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. * */ void cipso_v4_req_delattr(struct request_sock *req) { cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: * Inspect @cipso and return the security attributes in @secattr. Returns zero * on success and negative values on failure. * */ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi; struct cipso_v4_doi *doi_def; if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); return ret_val; } /** * cipso_v4_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CIPSO option attached to the sock and if * there is return the CIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ip_options_rcu *opt; int res = -ENOMSG; rcu_read_lock(); opt = rcu_dereference(inet_sk(sk)->inet_opt); if (opt && opt->opt.cipso) res = cipso_v4_getattr(opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr), secattr); rcu_read_unlock(); return res; } /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet * @secattr: the security attributes * * Description: * Set the CIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int cipso_v4_skbuff_setattr(struct sk_buff *skb, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; u32 buf_len = CIPSO_V4_OPT_LEN_MAX; u32 opt_len; int len_delta; ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) return ret_val; buf_len = ret_val; opt_len = (buf_len + 3) & ~3; /* we overwrite any existing options to ensure that we have enough * room for the CIPSO option, the reason is that we _need_ to guarantee * that the security label is applied to the packet - we do the same * thing when using the socket options and it hasn't caused a problem, * if we need to we can always revisit this choice later */ len_delta = opt_len - opt->optlen; /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; if (len_delta > 0) { /* we assume that the header + opt->optlen have already been * "pushed" in ip_options_build() or similar */ iph = ip_hdr(skb); skb_push(skb, len_delta); memmove((char *)iph - len_delta, iph, iph->ihl << 2); skb_reset_network_header(skb); iph = ip_hdr(skb); } else if (len_delta < 0) { iph = ip_hdr(skb); memset(iph + 1, IPOPT_NOP, opt->optlen); } else iph = ip_hdr(skb); if (opt->optlen > 0) memset(opt, 0, sizeof(*opt)); opt->optlen = opt_len; opt->cipso = sizeof(struct iphdr); opt->is_changed = 1; /* we have to do the following because we are being called from a * netfilter hook which means the packet already has had the header * fields populated and the checksum calculated - yes this means we * are doing more work than needed but we do it to keep the core * stack clean and tidy */ memcpy(iph + 1, buf, buf_len); if (opt_len > buf_len) memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); iph->tot_len = htons(skb->len); } ip_send_check(iph); return 0; } /** * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; if (opt->cipso == 0) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; /* the easiest thing to do is just replace the cipso option with noop * options since we don't change the size of the packet, although we * still need to recalculate the checksum */ iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); opt->cipso = 0; opt->is_changed = 1; ip_send_check(iph); return 0; } /* * Setup Functions */ /** * cipso_v4_init - Initialize the CIPSO module * * Description: * Initialize the CIPSO module and prepare it for use. Returns zero on success * and negative values on failure. * */ static int __init cipso_v4_init(void) { int ret_val; ret_val = cipso_v4_cache_init(); if (ret_val != 0) panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n", ret_val); return 0; } subsys_initcall(cipso_v4_init); |
115 180 61 61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 | /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These are definitions needed by the state machine. * * This SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This SCTP implementation is distributed in the hope that it * will be useful, but WITHOUT ANY WARRANTY; without even the implied * ************************ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU CC; see the file COPYING. If not, see * <http://www.gnu.org/licenses/>. * * Please send any bug reports or fixes you make to the * email addresses: * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sctp/command.h> #include <net/sctp/sctp.h> #ifndef __sctp_sm_h__ #define __sctp_sm_h__ /* * Possible values for the disposition are: */ enum sctp_disposition { SCTP_DISPOSITION_DISCARD, /* No further processing. */ SCTP_DISPOSITION_CONSUME, /* Process return values normally. */ SCTP_DISPOSITION_NOMEM, /* We ran out of memory--recover. */ SCTP_DISPOSITION_DELETE_TCB, /* Close the association. */ SCTP_DISPOSITION_ABORT, /* Close the association NOW. */ SCTP_DISPOSITION_VIOLATION, /* The peer is misbehaving. */ SCTP_DISPOSITION_NOT_IMPL, /* This entry is not implemented. */ SCTP_DISPOSITION_ERROR, /* This is plain old user error. */ SCTP_DISPOSITION_BUG, /* This is a bug. */ }; typedef enum sctp_disposition (sctp_state_fn_t) ( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); typedef void (sctp_timer_event_t) (struct timer_list *); struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; }; /* A naming convention of "sctp_sf_xxx" applies to all the state functions * currently in use. */ /* Prototypes for generic state functions. */ sctp_state_fn_t sctp_sf_not_impl; sctp_state_fn_t sctp_sf_bug; /* Prototypes for gener timer state functions. */ sctp_state_fn_t sctp_sf_timer_ignore; /* Prototypes for chunk state functions. */ sctp_state_fn_t sctp_sf_do_9_1_abort; sctp_state_fn_t sctp_sf_cookie_wait_abort; sctp_state_fn_t sctp_sf_cookie_echoed_abort; sctp_state_fn_t sctp_sf_shutdown_pending_abort; sctp_state_fn_t sctp_sf_shutdown_sent_abort; sctp_state_fn_t sctp_sf_shutdown_ack_sent_abort; sctp_state_fn_t sctp_sf_do_5_1B_init; sctp_state_fn_t sctp_sf_do_5_1C_ack; sctp_state_fn_t sctp_sf_do_5_1D_ce; sctp_state_fn_t sctp_sf_do_5_1E_ca; sctp_state_fn_t sctp_sf_do_4_C; sctp_state_fn_t sctp_sf_eat_data_6_2; sctp_state_fn_t sctp_sf_eat_data_fast_4_4; sctp_state_fn_t sctp_sf_eat_sack_6_2; sctp_state_fn_t sctp_sf_operr_notify; sctp_state_fn_t sctp_sf_t1_init_timer_expire; sctp_state_fn_t sctp_sf_t1_cookie_timer_expire; sctp_state_fn_t sctp_sf_t2_timer_expire; sctp_state_fn_t sctp_sf_t4_timer_expire; sctp_state_fn_t sctp_sf_t5_timer_expire; sctp_state_fn_t sctp_sf_sendbeat_8_3; sctp_state_fn_t sctp_sf_beat_8_3; sctp_state_fn_t sctp_sf_backbeat_8_3; sctp_state_fn_t sctp_sf_do_9_2_final; sctp_state_fn_t sctp_sf_do_9_2_shutdown; sctp_state_fn_t sctp_sf_do_9_2_shut_ctsn; sctp_state_fn_t sctp_sf_do_ecn_cwr; sctp_state_fn_t sctp_sf_do_ecne; sctp_state_fn_t sctp_sf_ootb; sctp_state_fn_t sctp_sf_pdiscard; sctp_state_fn_t sctp_sf_violation; sctp_state_fn_t sctp_sf_discard_chunk; sctp_state_fn_t sctp_sf_do_5_2_1_siminit; sctp_state_fn_t sctp_sf_do_5_2_2_dupinit; sctp_state_fn_t sctp_sf_do_5_2_3_initack; sctp_state_fn_t sctp_sf_do_5_2_4_dupcook; sctp_state_fn_t sctp_sf_unk_chunk; sctp_state_fn_t sctp_sf_do_8_5_1_E_sa; sctp_state_fn_t sctp_sf_cookie_echoed_err; sctp_state_fn_t sctp_sf_do_asconf; sctp_state_fn_t sctp_sf_do_asconf_ack; sctp_state_fn_t sctp_sf_do_reconf; sctp_state_fn_t sctp_sf_do_9_2_reshutack; sctp_state_fn_t sctp_sf_eat_fwd_tsn; sctp_state_fn_t sctp_sf_eat_fwd_tsn_fast; sctp_state_fn_t sctp_sf_eat_auth; /* Prototypes for primitive event state functions. */ sctp_state_fn_t sctp_sf_do_prm_asoc; sctp_state_fn_t sctp_sf_do_prm_send; sctp_state_fn_t sctp_sf_do_9_2_prm_shutdown; sctp_state_fn_t sctp_sf_cookie_wait_prm_shutdown; sctp_state_fn_t sctp_sf_cookie_echoed_prm_shutdown; sctp_state_fn_t sctp_sf_do_9_1_prm_abort; sctp_state_fn_t sctp_sf_cookie_wait_prm_abort; sctp_state_fn_t sctp_sf_cookie_echoed_prm_abort; sctp_state_fn_t sctp_sf_shutdown_pending_prm_abort; sctp_state_fn_t sctp_sf_shutdown_sent_prm_abort; sctp_state_fn_t sctp_sf_shutdown_ack_sent_prm_abort; sctp_state_fn_t sctp_sf_error_closed; sctp_state_fn_t sctp_sf_error_shutdown; sctp_state_fn_t sctp_sf_ignore_primitive; sctp_state_fn_t sctp_sf_do_prm_requestheartbeat; sctp_state_fn_t sctp_sf_do_prm_asconf; sctp_state_fn_t sctp_sf_do_prm_reconf; /* Prototypes for other event state functions. */ sctp_state_fn_t sctp_sf_do_no_pending_tsn; sctp_state_fn_t sctp_sf_do_9_2_start_shutdown; sctp_state_fn_t sctp_sf_do_9_2_shutdown_ack; sctp_state_fn_t sctp_sf_ignore_other; sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort; /* Prototypes for timeout event state functions. */ sctp_state_fn_t sctp_sf_do_6_3_3_rtx; sctp_state_fn_t sctp_sf_send_reconf; sctp_state_fn_t sctp_sf_do_6_2_sack; sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ __u8 sctp_get_chunk_type(struct sctp_chunk *chunk); const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event event_type, enum sctp_state state, union sctp_subtype event_subtype); int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); __u32 sctp_generate_verification_tag(void); void sctp_populate_tie_tags(__u8 *cookie, __u32 curTag, __u32 hisTag); /* Prototypes for chunk-building functions. */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len); struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const gfp_t gfp, const int unkparam_len); struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn); struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc); struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk); int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint); struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn); struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t msg_len); struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen); struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param); struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport); struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen); struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail); struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags); struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr); bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp); struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf); int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack); struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id); struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in); struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc); struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in); struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn); struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn); bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp); void sctp_chunk_assign_tsn(struct sctp_chunk *chunk); void sctp_chunk_assign_ssn(struct sctp_chunk *chunk); /* Prototypes for stream-processing functions. */ struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); /* Prototypes for statetable processing. */ int sctp_do_sm(struct net *net, enum sctp_event event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp); /* 2nd level prototypes */ void sctp_generate_t3_rtx_event(struct timer_list *t); void sctp_generate_heartbeat_event(struct timer_list *t); void sctp_generate_reconf_event(struct timer_list *t); void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *err, struct sctp_chunk **err_chk_p); /* 3rd level prototypes */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep); __u32 sctp_generate_tsn(const struct sctp_endpoint *ep); /* Extern declarations for major data structures. */ extern sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES]; /* Get the size of a DATA chunk payload. */ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) { __u16 size; size = ntohs(chunk->chunk_hdr->length); size -= sctp_datachk_len(&chunk->asoc->stream); return size; } /* Compare two TSNs */ #define TSN_lt(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) < 0)) #define TSN_lte(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) <= 0)) /* Compare two MIDs */ #define MID_lt(a, b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) < 0)) /* Compare two SSNs */ #define SSN_lt(a,b) \ (typecheck(__u16, a) && \ typecheck(__u16, b) && \ ((__s16)((a) - (b)) < 0)) /* ADDIP 3.1.1 */ #define ADDIP_SERIAL_gte(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((b) - (a)) <= 0)) /* Check VTAG of the packet matches the sender's own tag. */ static inline int sctp_vtag_verify(const struct sctp_chunk *chunk, const struct sctp_association *asoc) { /* RFC 2960 Sec 8.5 When receiving an SCTP packet, the endpoint * MUST ensure that the value in the Verification Tag field of * the received SCTP packet matches its own Tag. If the received * Verification Tag value does not match the receiver's own * tag value, the receiver shall silently discard the packet... */ if (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag) return 1; return 0; } /* Check VTAG of the packet matches the sender's own tag and the T bit is * not set, OR its peer's tag and the T bit is set in the Chunk Flags. */ static inline int sctp_vtag_verify_either(const struct sctp_chunk *chunk, const struct sctp_association *asoc) { /* RFC 2960 Section 8.5.1, sctpimpguide Section 2.41 * * B) The receiver of a ABORT MUST accept the packet * if the Verification Tag field of the packet matches its own tag * and the T bit is not set * OR * it is set to its peer's tag and the T bit is set in the Chunk * Flags. * Otherwise, the receiver MUST silently discard the packet * and take no further action. * * C) The receiver of a SHUTDOWN COMPLETE shall accept the packet * if the Verification Tag field of the packet matches its own tag * and the T bit is not set * OR * it is set to its peer's tag and the T bit is set in the Chunk * Flags. * Otherwise, the receiver MUST silently discard the packet * and take no further action. An endpoint MUST ignore the * SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state. */ if ((!sctp_test_T_bit(chunk) && (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag)) || (sctp_test_T_bit(chunk) && asoc->c.peer_vtag && (ntohl(chunk->sctp_hdr->vtag) == asoc->c.peer_vtag))) { return 1; } return 0; } #endif /* __sctp_sm_h__ */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking expectations. */ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_zones.h> extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; extern struct hlist_head *nf_ct_expect_hash; struct nf_conntrack_expect { /* Conntrack expectation list member */ struct hlist_node lnode; /* Hash member */ struct hlist_node hnode; /* We expect this tuple, with the following mask */ struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; /* Function to call after setup and insertion */ void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); /* Helper to assign to new connection */ struct nf_conntrack_helper *helper; /* The conntrack of the master connection */ struct nf_conn *master; /* Timer function; deletes the expectation. */ struct timer_list timeout; /* Usage count. */ refcount_t use; /* Flags */ unsigned int flags; /* Expectation class */ unsigned int class; #ifdef CONFIG_NF_NAT_NEEDED union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ union nf_conntrack_man_proto saved_proto; /* Direction relative to the master connection. */ enum ip_conntrack_dir dir; #endif struct rcu_head rcu; }; static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { return nf_ct_net(exp->master); } #define NF_CT_EXP_POLICY_NAME_LEN 16 struct nf_conntrack_expect_policy { unsigned int max_expected; unsigned int timeout; char name[NF_CT_EXP_POLICY_NAME_LEN]; }; #define NF_CT_EXPECT_CLASS_DEFAULT 0 #define NF_CT_EXPECT_MAX_CNT 255 int nf_conntrack_expect_pernet_init(struct net *net); void nf_conntrack_expect_pernet_fini(struct net *net); int nf_conntrack_expect_init(void); void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report); static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { nf_ct_unlink_expect_report(exp, 0, 0); } void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); void nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, const union nf_inet_addr *, const union nf_inet_addr *, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report); static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) { return nf_ct_expect_related_report(expect, 0, 0); } #endif /*_NF_CONNTRACK_EXPECT_H*/ |
30 21 12 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 | /* * Detect Hung Task * * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state * */ #include <linux/mm.h> #include <linux/cpu.h> #include <linux/nmi.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> #include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <trace/events/sched.h> /* * The number of tasks checked: */ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Limit number of tasks checked in a batch. * * This value controls the preemptibility of khungtaskd since preemption * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ #define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; /* * Zero (default value) means use sysctl_hung_task_timeout_secs: */ unsigned long __read_mostly sysctl_hung_task_check_interval_secs; int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; static struct task_struct *watchdog_task; /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; static int __init hung_task_panic_setup(char *str) { int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); if (rc) return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { did_panic = 1; return NOTIFY_DONE; } static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; /* * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. */ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) return; /* * When a freshly created task is scheduled once, changes its state to * TASK_UNINTERRUPTIBLE without having ever been switched out once, it * musn't be checked. */ if (unlikely(!switch_count)) return; if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; t->last_switch_time = jiffies; return; } if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) return; trace_sched_process_hang(t); if (sysctl_hung_task_panic) { console_verbose(); hung_task_show_lock = true; hung_task_call_panic = true; } /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ if (sysctl_hung_task_warnings) { if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; } touch_nmi_watchdog(); } /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order * to exit the grace period. For classic RCU, a reschedule is required. */ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) { bool can_cont; get_task_struct(g); get_task_struct(t); rcu_read_unlock(); cond_resched(); rcu_read_lock(); can_cont = pid_alive(g) && pid_alive(t); put_task_struct(t); put_task_struct(g); return can_cont; } /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out * a warning. */ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; /* * If the system crashed already then all bets are off, * do not report extra hung tasks: */ if (test_taint(TAINT_DIE) || did_panic) return; hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); } unlock: rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); if (hung_task_call_panic) { trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } } static long hung_timeout_jiffies(unsigned long last_checked, unsigned long timeout) { /* timeout of 0 will disable the watchdog */ return timeout ? last_checked - jiffies + timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; wake_up_process(watchdog_task); out: return ret; } static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) { atomic_set(&reset_hung_task, 1); } EXPORT_SYMBOL_GPL(reset_hung_task_detector); static bool hung_detector_suspended; static int hungtask_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: hung_detector_suspended = true; break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: case PM_POST_RESTORE: hung_detector_suspended = false; break; default: break; } return NOTIFY_OK; } /* * kthread which checks for tasks stuck in D state */ static int watchdog(void *dummy) { unsigned long hung_last_checked = jiffies; set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; unsigned long interval = sysctl_hung_task_check_interval_secs; long t; if (interval == 0) interval = timeout; interval = min_t(unsigned long, interval, timeout); t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0) && !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; } schedule_timeout_interruptible(t); } return 0; } static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); /* Disable hung task detector on suspend */ pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; } subsys_initcall(hung_task_init); |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif #ifdef arch_idle_time static u64 get_idle_time(int cpu) { u64 idle; idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) idle += arch_idle_time(cpu); return idle; } static u64 get_iowait_time(int cpu) { u64 iowait; iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) iowait += arch_idle_time(cpu); return iowait; } #else static u64 get_idle_time(int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } #endif static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); for_each_possible_cpu(i) { user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(i); iowait += get_iowait_time(i); irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(i); iowait = get_iowait_time(i); irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); /* sum again ? it could be updated? */ for_each_irq_nr(j) seq_put_decimal_ull(p, " ", kstat_irqs_usr(j)); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %lu\n" "procs_blocked %lu\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { .open = stat_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } fs_initcall(proc_stat_init); |
221 135 8 4 8 5 3 8 8 33 27 7 4 386 386 386 386 385 10 8 13 2 372 372 5 8 33 29 372 386 17 369 353 295 260 353 351 352 291 291 262 205 352 353 353 260 295 353 353 319 297 291 324 82 292 294 292 294 294 292 292 15 1 3 2 13 293 2 354 352 353 353 354 326 353 32 32 32 350 146 292 58 325 293 294 93 352 279 279 164 139 260 252 6 3 208 207 74 135 7 129 126 207 206 3 208 398 382 26 382 260 2 260 206 30 156 173 145 82 82 82 82 65 7 255 255 255 237 234 96 96 96 91 64 53 64 43 96 18 5 13 17 16 3 3 2 1 2 2 2 1 6 5 6 3 3 139 139 14 10 10 10 10 10 11 11 10 10 5 14 30 26 24 26 26 26 5 26 26 2 21 128 125 22 128 77 76 28 77 12 12 12 12 3 12 12 12 4 4 2 2 2 9 9 8 9 31 31 7 7 4 7 7 7 5 4 4 4 7 7 7 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 | /* * net/9p/clnt.c * * 9P Client * * Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/idr.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/uio.h> #include <net/9p/9p.h> #include <linux/parser.h> #include <linux/seq_file.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "protocol.h" #define CREATE_TRACE_POINTS #include <trace/events/9p.h> /* * Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options */ enum { Opt_msize, Opt_trans, Opt_legacy, Opt_version, Opt_err, }; static const match_table_t tokens = { {Opt_msize, "msize=%u"}, {Opt_legacy, "noextend"}, {Opt_trans, "trans=%s"}, {Opt_version, "version=%s"}, {Opt_err, NULL}, }; inline int p9_is_proto_dotl(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000L; } EXPORT_SYMBOL(p9_is_proto_dotl); inline int p9_is_proto_dotu(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000u; } EXPORT_SYMBOL(p9_is_proto_dotu); int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->msize != 8192) seq_printf(m, ",msize=%u", clnt->msize); seq_printf(m, ",trans=%s", clnt->trans_mod->name); switch (clnt->proto_version) { case p9_proto_legacy: seq_puts(m, ",noextend"); break; case p9_proto_2000u: seq_puts(m, ",version=9p2000.u"); break; case p9_proto_2000L: /* Default */ break; } if (clnt->trans_mod->show_options) return clnt->trans_mod->show_options(m, clnt); return 0; } EXPORT_SYMBOL(p9_show_client_options); /* * Some error codes are taken directly from the server replies, * make sure they are valid. */ static int safe_errno(int err) { if ((err > 0) || (err < -MAX_ERRNO)) { p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err); return -EPROTO; } return err; } /* Interpret mount option for protocol version */ static int get_protocol_version(char *s) { int version = -EINVAL; if (!strcmp(s, "9p2000")) { version = p9_proto_legacy; p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); } else if (!strcmp(s, "9p2000.u")) { version = p9_proto_2000u; p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); } else if (!strcmp(s, "9p2000.L")) { version = p9_proto_2000L; p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); } else pr_info("Unknown protocol version %s\n", s); return version; } /** * parse_options - parse mount options into client structure * @opts: options string passed from mount * @clnt: existing v9fs client information * * Return 0 upon success, -ERRNO upon failure */ static int parse_opts(char *opts, struct p9_client *clnt) { char *options, *tmp_options; char *p; substring_t args[MAX_OPT_ARGS]; int option; char *s; int ret = 0; clnt->proto_version = p9_proto_2000L; clnt->msize = 8192; if (!opts) return 0; tmp_options = kstrdup(opts, GFP_KERNEL); if (!tmp_options) { p9_debug(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token, r; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_msize: r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; continue; } if (option < 4096) { p9_debug(P9_DEBUG_ERROR, "msize should be at least 4k\n"); ret = -EINVAL; continue; } clnt->msize = option; break; case Opt_trans: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of trans arg\n"); goto free_and_return; } v9fs_put_trans(clnt->trans_mod); clnt->trans_mod = v9fs_get_trans_by_name(s); if (clnt->trans_mod == NULL) { pr_info("Could not find request transport: %s\n", s); ret = -EINVAL; } kfree(s); break; case Opt_legacy: clnt->proto_version = p9_proto_legacy; break; case Opt_version: s = match_strdup(&args[0]); if (!s) { ret = -ENOMEM; p9_debug(P9_DEBUG_ERROR, "problem allocating copy of version arg\n"); goto free_and_return; } r = get_protocol_version(s); if (r < 0) ret = r; else clnt->proto_version = r; kfree(s); break; default: continue; } } free_and_return: if (ret) v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, int alloc_msize) { if (likely(c->fcall_cache) && alloc_msize == c->msize) { fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); fc->cache = c->fcall_cache; } else { fc->sdata = kmalloc(alloc_msize, GFP_NOFS); fc->cache = NULL; } if (!fc->sdata) return -ENOMEM; fc->capacity = alloc_msize; return 0; } void p9_fcall_fini(struct p9_fcall *fc) { /* sdata can be NULL for interrupted requests in trans_rdma, * and kmem_cache_free does not do NULL-check for us */ if (unlikely(!fc->sdata)) return; if (fc->cache) kmem_cache_free(fc->cache, fc->sdata); else kfree(fc->sdata); } EXPORT_SYMBOL(p9_fcall_fini); static struct kmem_cache *p9_req_cache; /** * p9_req_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. * @max_size: Maximum packet size for this request. * * Context: Process context. * Return: Pointer to new request. */ static struct p9_req_t * p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) { struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); int alloc_msize = min(c->msize, max_size); int tag; if (!req) return ERR_PTR(-ENOMEM); if (p9_fcall_init(c, &req->tc, alloc_msize)) goto free_req; if (p9_fcall_init(c, &req->rc, alloc_msize)) goto free; p9pdu_reset(&req->tc); p9pdu_reset(&req->rc); req->t_err = 0; req->status = REQ_STATUS_ALLOC; init_waitqueue_head(&req->wq); INIT_LIST_HEAD(&req->req_list); idr_preload(GFP_NOFS); spin_lock_irq(&c->lock); if (type == P9_TVERSION) tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, GFP_NOWAIT); else tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); req->tc.tag = tag; spin_unlock_irq(&c->lock); idr_preload_end(); if (tag < 0) goto free; /* Init ref to two because in the general case there is one ref * that is put asynchronously by a writer thread, one ref * temporarily given by p9_tag_lookup and put by p9_client_cb * in the recv thread, and one ref put by p9_tag_remove in the * main thread. The only exception is virtio that does not use * p9_tag_lookup but does not have a writer thread either * (the write happens synchronously in the request/zc_request * callback), so p9_client_cb eats the second ref there * as the pointer is duplicated directly by virtqueue_add_sgs() */ refcount_set(&req->refcount.refcount, 2); return req; free: p9_fcall_fini(&req->tc); p9_fcall_fini(&req->rc); free_req: kmem_cache_free(p9_req_cache, req); return ERR_PTR(-ENOMEM); } /** * p9_tag_lookup - Look up a request by tag. * @c: Client session. * @tag: Transaction ID. * * Context: Any context. * Return: A request, or %NULL if there is no request with that tag. */ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) { struct p9_req_t *req; rcu_read_lock(); again: req = idr_find(&c->reqs, tag); if (req) { /* We have to be careful with the req found under rcu_read_lock * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the * ref again without corrupting other data, then check again * that the tag matches once we have the ref */ if (!p9_req_try_get(req)) goto again; if (req->tc.tag != tag) { p9_req_put(req); goto again; } } rcu_read_unlock(); return req; } EXPORT_SYMBOL(p9_tag_lookup); /** * p9_tag_remove - Remove a tag. * @c: Client session. * @r: Request of reference. * * Context: Any context. */ static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) { unsigned long flags; u16 tag = r->tc.tag; p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); spin_lock_irqsave(&c->lock, flags); idr_remove(&c->reqs, tag); spin_unlock_irqrestore(&c->lock, flags); return p9_req_put(r); } static void p9_req_free(struct kref *ref) { struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); p9_fcall_fini(&r->tc); p9_fcall_fini(&r->rc); kmem_cache_free(p9_req_cache, r); } int p9_req_put(struct p9_req_t *r) { return kref_put(&r->refcount, p9_req_free); } EXPORT_SYMBOL(p9_req_put); /** * p9_tag_cleanup - cleans up tags structure and reclaims resources * @c: v9fs client struct * * This frees resources associated with the tags structure * */ static void p9_tag_cleanup(struct p9_client *c) { struct p9_req_t *req; int id; rcu_read_lock(); idr_for_each_entry(&c->reqs, req, id) { pr_info("Tag %d still in use\n", id); if (p9_tag_remove(c, req) == 0) pr_warn("Packet with tag %d has still references", req->tc.tag); } rcu_read_unlock(); } /** * p9_client_cb - call back from transport to client * c: client state * req: request received * */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); /* * This barrier is needed to make sure any change made to req before * the status change is visible to another thread */ smp_wmb(); req->status = status; wake_up(&req->wq); p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); p9_req_put(req); } EXPORT_SYMBOL(p9_client_cb); /** * p9_parse_header - parse header arguments out of a packet * @pdu: packet to parse * @size: size of packet * @type: type of request * @tag: tag of packet * @rewind: set if we need to rewind offset afterwards */ int p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag, int rewind) { int8_t r_type; int16_t r_tag; int32_t r_size; int offset = pdu->offset; int err; pdu->offset = 0; err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag); if (err) goto rewind_and_exit; if (type) *type = r_type; if (tag) *tag = r_tag; if (size) *size = r_size; if (pdu->size != r_size || r_size < 7) { err = -EINVAL; goto rewind_and_exit; } pdu->id = r_type; pdu->tag = r_tag; p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n", pdu->size, pdu->id, pdu->tag); rewind_and_exit: if (rewind) pdu->offset = offset; return err; } EXPORT_SYMBOL(p9_parse_header); /** * p9_check_errors - check 9p packet for error return and process it * @c: current client instance * @req: request to parse and check for error conditions * * returns error code if one is discovered, otherwise returns 0 * * this will have to be more complicated if we have multiple * error packet types */ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) { int8_t type; int err; int ecode; err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); if (req->rc.size >= c->msize) { p9_debug(P9_DEBUG_ERROR, "requested packet size too big: %d\n", req->rc.size); return -EIO; } /* * dump the response from server * This should be after check errors which poplulate pdu_fcall. */ trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } if (type != P9_RERROR && type != P9_RLERROR) return 0; if (!p9_is_proto_dotl(c)) { char *ename; err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); } kfree(ename); } else { err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); if (err) goto out_err; err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } return err; out_err: p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } /** * p9_check_zc_errors - check 9p packet for error return and process it * @c: current client instance * @req: request to parse and check for error conditions * @in_hdrlen: Size of response protocol buffer. * * returns error code if one is discovered, otherwise returns 0 * * this will have to be more complicated if we have multiple * error packet types */ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, struct iov_iter *uidata, int in_hdrlen) { int err; int ecode; int8_t type; char *ename = NULL; err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); /* * dump the response from server * This should be after parse_header which poplulate pdu_fcall. */ trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; } if (type != P9_RERROR && type != P9_RLERROR) return 0; if (!p9_is_proto_dotl(c)) { /* Error is reported in string format */ int len; /* 7 = header size for RERROR; */ int inline_len = in_hdrlen - 7; len = req->rc.size - req->rc.offset; if (len > (P9_ZC_HDR_SZ - 7)) { err = -EFAULT; goto out_err; } ename = &req->rc.sdata[req->rc.offset]; if (len > inline_len) { /* We have error in external buffer */ if (!copy_from_iter_full(ename + inline_len, len - inline_len, uidata)) { err = -EFAULT; goto out_err; } } ename = NULL; err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename); } kfree(ename); } else { err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); } return err; out_err: p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err); return err; } static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); /** * p9_client_flush - flush (cancel) a request * @c: client state * @oldreq: request to cancel * * This sents a flush for a particular request and links * the flush request to the original request. The current * code only supports a single flush request although the protocol * allows for multiple flush requests to be sent for a single request. * */ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) { struct p9_req_t *req; int16_t oldtag; int err; err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); if (err) return err; p9_debug(P9_DEBUG_9P, ">>> TFLUSH tag %d\n", oldtag); req = p9_client_rpc(c, P9_TFLUSH, "w", oldtag); if (IS_ERR(req)) return PTR_ERR(req); /* * if we haven't received a response for oldreq, * remove it from the list */ if (oldreq->status == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); } p9_tag_remove(c, req); return 0; } static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int8_t type, int req_size, const char *fmt, va_list ap) { int err; struct p9_req_t *req; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); /* we allow for any status other than disconnected */ if (c->status == Disconnected) return ERR_PTR(-EIO); /* if status is begin_disconnected we allow only clunk request */ if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); req = p9_tag_alloc(c, type, req_size); if (IS_ERR(req)) return req; /* marshall the data */ p9pdu_prepare(&req->tc, req->tc.tag, type); err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); if (err) goto reterr; p9pdu_finalize(c, &req->tc); trace_9p_client_req(c, type, req->tc.tag); return req; reterr: p9_tag_remove(c, req); /* We have to put also the 2nd reference as it won't be used */ p9_req_put(req); return ERR_PTR(err); } /** * p9_client_rpc - issue a request and wait for a response * @c: client session * @type: type of request * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) { va_list ap; int sigpending, err; unsigned long flags; struct p9_req_t *req; va_start(ap, fmt); req = p9_client_prepare_req(c, type, c->msize, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); } else sigpending = 0; err = c->trans_mod->request(c, req); if (err < 0) { /* write won't happen */ p9_req_put(req); if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto recalc_sigpending; } again: /* Wait for the response */ err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); /* * Make sure our req is coherent with regard to updates in other * threads - echoes to wmb() in the callback */ smp_rmb(); if ((err == -ERESTARTSYS) && (c->status == Connected) && (type == P9_TFLUSH)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); goto again; } if (req->status == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if ((err == -ERESTARTSYS) && (c->status == Connected)) { p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); if (c->trans_mod->cancel(c, req)) p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ if (req->status == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } if (err < 0) goto reterr; err = p9_check_errors(c, req); trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } /** * p9_client_zc_rpc - issue a request and wait for a response * @c: client session * @type: type of request * @uidata: destination for zero copy read * @uodata: source for zero copy write * @inlen: read buffer size * @olen: write buffer size * @hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, struct iov_iter *uidata, struct iov_iter *uodata, int inlen, int olen, int in_hdrlen, const char *fmt, ...) { va_list ap; int sigpending, err; unsigned long flags; struct p9_req_t *req; va_start(ap, fmt); /* * We allocate a inline protocol data of only 4k bytes. * The actual content is passed in zero-copy fashion. */ req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); } else sigpending = 0; err = c->trans_mod->zc_request(c, req, uidata, uodata, inlen, olen, in_hdrlen); if (err < 0) { if (err == -EIO) c->status = Disconnected; if (err != -ERESTARTSYS) goto recalc_sigpending; } if (req->status == REQ_STATUS_ERROR) { p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); err = req->t_err; } if ((err == -ERESTARTSYS) && (c->status == Connected)) { p9_debug(P9_DEBUG_MUX, "flushing\n"); sigpending = 1; clear_thread_flag(TIF_SIGPENDING); if (c->trans_mod->cancel(c, req)) p9_client_flush(c, req); /* if we received the response anyway, don't signal error */ if (req->status == REQ_STATUS_RCVD) err = 0; } recalc_sigpending: if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } if (err < 0) goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen); trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } static struct p9_fid *p9_fid_create(struct p9_client *clnt) { int ret; struct p9_fid *fid; p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); fid = kzalloc(sizeof(struct p9_fid), GFP_KERNEL); if (!fid) return NULL; fid->mode = -1; fid->uid = current_fsuid(); fid->clnt = clnt; idr_preload(GFP_KERNEL); spin_lock_irq(&clnt->lock); ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1, GFP_NOWAIT); spin_unlock_irq(&clnt->lock); idr_preload_end(); if (!ret) return fid; kfree(fid); return NULL; } static void p9_fid_destroy(struct p9_fid *fid) { struct p9_client *clnt; unsigned long flags; p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); clnt = fid->clnt; spin_lock_irqsave(&clnt->lock, flags); idr_remove(&clnt->fids, fid->fid); spin_unlock_irqrestore(&clnt->lock, flags); kfree(fid->rdir); kfree(fid); } static int p9_client_version(struct p9_client *c) { int err = 0; struct p9_req_t *req; char *version = NULL; int msize; p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n", c->msize, c->proto_version); switch (c->proto_version) { case p9_proto_2000L: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000.L"); break; case p9_proto_2000u: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000.u"); break; case p9_proto_legacy: req = p9_client_rpc(c, P9_TVERSION, "ds", c->msize, "9P2000"); break; default: return -EINVAL; } if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); if (err) { p9_debug(P9_DEBUG_9P, "version error %d\n", err); trace_9p_protocol_dump(c, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version); if (!strncmp(version, "9P2000.L", 8)) c->proto_version = p9_proto_2000L; else if (!strncmp(version, "9P2000.u", 8)) c->proto_version = p9_proto_2000u; else if (!strncmp(version, "9P2000", 6)) c->proto_version = p9_proto_legacy; else { p9_debug(P9_DEBUG_ERROR, "server returned an unknown version: %s\n", version); err = -EREMOTEIO; goto error; } if (msize < 4096) { p9_debug(P9_DEBUG_ERROR, "server returned a msize < 4096: %d\n", msize); err = -EREMOTEIO; goto error; } if (msize < c->msize) c->msize = msize; error: kfree(version); p9_tag_remove(c, req); return err; } struct p9_client *p9_client_create(const char *dev_name, char *options) { int err; struct p9_client *clnt; char *client_id; err = 0; clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL); if (!clnt) return ERR_PTR(-ENOMEM); clnt->trans_mod = NULL; clnt->trans = NULL; clnt->fcall_cache = NULL; client_id = utsname()->nodename; memcpy(clnt->name, client_id, strlen(client_id) + 1); spin_lock_init(&clnt->lock); idr_init(&clnt->fids); idr_init(&clnt->reqs); err = parse_opts(options, clnt); if (err < 0) goto free_client; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); if (clnt->trans_mod == NULL) { err = -EPROTONOSUPPORT; p9_debug(P9_DEBUG_ERROR, "No transport defined or default transport\n"); goto free_client; } p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); err = clnt->trans_mod->create(clnt, dev_name, options); if (err) goto put_trans; if (clnt->msize > clnt->trans_mod->maxsize) clnt->msize = clnt->trans_mod->maxsize; if (clnt->msize < 4096) { p9_debug(P9_DEBUG_ERROR, "Please specify a msize of at least 4k\n"); err = -EINVAL; goto close_trans; } err = p9_client_version(clnt); if (err) goto close_trans; /* P9_HDRSZ + 4 is the smallest packet header we can have that is * followed by data accessed from userspace by read */ clnt->fcall_cache = kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, 0, 0, P9_HDRSZ + 4, clnt->msize - (P9_HDRSZ + 4), NULL); return clnt; close_trans: clnt->trans_mod->close(clnt); put_trans: v9fs_put_trans(clnt->trans_mod); free_client: kfree(clnt); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_create); void p9_client_destroy(struct p9_client *clnt) { struct p9_fid *fid; int id; p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt); if (clnt->trans_mod) clnt->trans_mod->close(clnt); v9fs_put_trans(clnt->trans_mod); idr_for_each_entry(&clnt->fids, fid, id) { pr_info("Found fid %d not clunked\n", fid->fid); p9_fid_destroy(fid); } p9_tag_cleanup(clnt); kmem_cache_destroy(clnt->fcall_cache); kfree(clnt); } EXPORT_SYMBOL(p9_client_destroy); void p9_client_disconnect(struct p9_client *clnt) { p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = Disconnected; } EXPORT_SYMBOL(p9_client_disconnect); void p9_client_begin_disconnect(struct p9_client *clnt) { p9_debug(P9_DEBUG_9P, "clnt %p\n", clnt); clnt->status = BeginDisconnect; } EXPORT_SYMBOL(p9_client_begin_disconnect); struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, const char *uname, kuid_t n_uname, const char *aname) { int err = 0; struct p9_req_t *req; struct p9_fid *fid; struct p9_qid qid; p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", afid ? afid->fid : -1, uname, aname); fid = p9_fid_create(clnt); if (!fid) { err = -ENOMEM; goto error; } fid->uid = n_uname; req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid, afid ? afid->fid : P9_NOFID, uname, aname, n_uname); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n", qid.type, (unsigned long long)qid.path, qid.version); memmove(&fid->qid, &qid, sizeof(struct p9_qid)); p9_tag_remove(clnt, req); return fid; error: if (fid) p9_fid_destroy(fid); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_attach); struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, const unsigned char * const *wnames, int clone) { int err; struct p9_client *clnt; struct p9_fid *fid; struct p9_qid *wqids; struct p9_req_t *req; uint16_t nwqids, count; err = 0; wqids = NULL; clnt = oldfid->clnt; if (clone) { fid = p9_fid_create(clnt); if (!fid) { err = -ENOMEM; goto error; } fid->uid = oldfid->uid; } else fid = oldfid; p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n", oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL); req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid, nwname, wnames); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto clunk_fid; } p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); if (nwqids != nwname) { err = -ENOENT; goto clunk_fid; } for (count = 0; count < nwqids; count++) p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n", count, wqids[count].type, (unsigned long long)wqids[count].path, wqids[count].version); if (nwname) memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid)); else fid->qid = oldfid->qid; kfree(wqids); return fid; clunk_fid: kfree(wqids); p9_client_clunk(fid); fid = NULL; error: if (fid && (fid != oldfid)) p9_fid_destroy(fid); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_walk); int p9_client_open(struct p9_fid *fid, int mode) { int err; struct p9_client *clnt; struct p9_req_t *req; struct p9_qid qid; int iounit; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n", p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode); err = 0; if (fid->mode != -1) return -EINVAL; if (p9_is_proto_dotl(clnt)) req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode); else req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n", p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type, (unsigned long long)qid.path, qid.version, iounit); fid->mode = mode; fid->iounit = iounit; free_and_error: p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_open); int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode, kgid_t gid, struct p9_qid *qid) { int err = 0; struct p9_client *clnt; struct p9_req_t *req; int iounit; p9_debug(P9_DEBUG_9P, ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n", ofid->fid, name, flags, mode, from_kgid(&init_user_ns, gid)); clnt = ofid->clnt; if (ofid->mode != -1) return -EINVAL; req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags, mode, gid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n", qid->type, (unsigned long long)qid->path, qid->version, iounit); ofid->mode = mode; ofid->iounit = iounit; free_and_error: p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_create_dotl); int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, char *extension) { int err; struct p9_client *clnt; struct p9_req_t *req; struct p9_qid qid; int iounit; p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n", fid->fid, name, perm, mode); err = 0; clnt = fid->clnt; if (fid->mode != -1) return -EINVAL; req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm, mode, extension); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n", qid.type, (unsigned long long)qid.path, qid.version, iounit); fid->mode = mode; fid->iounit = iounit; free_and_error: p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_fcreate); int p9_client_symlink(struct p9_fid *dfid, const char *name, const char *symtgt, kgid_t gid, struct p9_qid *qid) { int err = 0; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n", dfid->fid, name, symtgt); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt, gid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); free_and_error: p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_symlink); int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newname) { struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n", dfid->fid, oldfid->fid, newname); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid, newname); if (IS_ERR(req)) return PTR_ERR(req); p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); p9_tag_remove(clnt, req); return 0; } EXPORT_SYMBOL(p9_client_link); int p9_client_fsync(struct p9_fid *fid, int datasync) { int err; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n", fid->fid, datasync); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_fsync); int p9_client_clunk(struct p9_fid *fid) { int err; struct p9_client *clnt; struct p9_req_t *req; int retries = 0; if (!fid) { pr_warn("%s (%d): Trying to clunk with NULL fid\n", __func__, task_pid_nr(current)); dump_stack(); return 0; } again: p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid, retries); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: /* * Fid is not valid even after a failed clunk * If interrupted, retry once then give up and * leak fid until umount. */ if (err == -ERESTARTSYS) { if (retries++ == 0) goto again; } else p9_fid_destroy(fid); return err; } EXPORT_SYMBOL(p9_client_clunk); int p9_client_remove(struct p9_fid *fid) { int err; struct p9_client *clnt; struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: if (err == -ERESTARTSYS) p9_client_clunk(fid); else p9_fid_destroy(fid); return err; } EXPORT_SYMBOL(p9_client_remove); int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) { int err = 0; struct p9_req_t *req; struct p9_client *clnt; p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n", dfid->fid, name, flags); clnt = dfid->clnt; req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_unlinkat); int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) { struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int total = 0; *err = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, (unsigned long long) offset, (int)iov_iter_count(to)); while (iov_iter_count(to)) { int count = iov_iter_count(to); int rsize, non_zc = 0; char *dataptr; rsize = fid->iounit; if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) rsize = clnt->msize - P9_IOHDRSZ; if (count < rsize) rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { /* * response header len is 11 * PDU Header(7) + IO Size (4) */ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, 0, 11, "dqd", fid->fid, offset, rsize); } else { non_zc = 1; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); } if (IS_ERR(req)) { *err = PTR_ERR(req); break; } *err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (*err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); break; } if (rsize < count) { pr_err("bogus RREAD count (%d > %d)\n", count, rsize); count = rsize; } p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { p9_tag_remove(clnt, req); break; } if (non_zc) { int n = copy_to_iter(dataptr, count, to); total += n; offset += n; if (n != count) { *err = -EFAULT; p9_tag_remove(clnt, req); break; } } else { iov_iter_advance(to, count); total += count; offset += count; } p9_tag_remove(clnt, req); } return total; } EXPORT_SYMBOL(p9_client_read); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) { struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int total = 0; *err = 0; p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n", fid->fid, (unsigned long long) offset, iov_iter_count(from)); while (iov_iter_count(from)) { int count = iov_iter_count(from); int rsize = fid->iounit; if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) rsize = clnt->msize - P9_IOHDRSZ; if (count < rsize) rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, rsize, P9_ZC_HDR_SZ, "dqd", fid->fid, offset, rsize); } else { req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, offset, rsize, from); } if (IS_ERR(req)) { *err = PTR_ERR(req); break; } *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &count); if (*err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); break; } if (rsize < count) { pr_err("bogus RWRITE count (%d > %d)\n", count, rsize); count = rsize; } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); p9_tag_remove(clnt, req); iov_iter_advance(from, count); total += count; offset += count; } return total; } EXPORT_SYMBOL(p9_client_write); struct p9_wstat *p9_client_stat(struct p9_fid *fid) { int err; struct p9_client *clnt; struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL); struct p9_req_t *req; u16 ignored; p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid); if (!ret) return ERR_PTR(-ENOMEM); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n" "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n" "<<< uid=%d gid=%d n_muid=%d\n", ret->size, ret->type, ret->dev, ret->qid.type, (unsigned long long)ret->qid.path, ret->qid.version, ret->mode, ret->atime, ret->mtime, (unsigned long long)ret->length, ret->name, ret->uid, ret->gid, ret->muid, ret->extension, from_kuid(&init_user_ns, ret->n_uid), from_kgid(&init_user_ns, ret->n_gid), from_kuid(&init_user_ns, ret->n_muid)); p9_tag_remove(clnt, req); return ret; error: kfree(ret); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_stat); struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, u64 request_mask) { int err; struct p9_client *clnt; struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl), GFP_KERNEL); struct p9_req_t *req; p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n", fid->fid, request_mask); if (!ret) return ERR_PTR(-ENOMEM); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n" "<<< qid=%x.%llx.%x\n" "<<< st_mode=%8.8x st_nlink=%llu\n" "<<< st_uid=%d st_gid=%d\n" "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n" "<<< st_atime_sec=%lld st_atime_nsec=%lld\n" "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" "<<< st_gen=%lld st_data_version=%lld\n", ret->st_result_mask, ret->qid.type, ret->qid.path, ret->qid.version, ret->st_mode, ret->st_nlink, from_kuid(&init_user_ns, ret->st_uid), from_kgid(&init_user_ns, ret->st_gid), ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec, ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, ret->st_gen, ret->st_data_version); p9_tag_remove(clnt, req); return ret; error: kfree(ret); return ERR_PTR(err); } EXPORT_SYMBOL(p9_client_getattr_dotl); static int p9_client_statsize(struct p9_wstat *wst, int proto_version) { int ret; /* NOTE: size shouldn't include its own length */ /* size[2] type[2] dev[4] qid[13] */ /* mode[4] atime[4] mtime[4] length[8]*/ /* name[s] uid[s] gid[s] muid[s] */ ret = 2+4+13+4+4+4+8+2+2+2+2; if (wst->name) ret += strlen(wst->name); if (wst->uid) ret += strlen(wst->uid); if (wst->gid) ret += strlen(wst->gid); if (wst->muid) ret += strlen(wst->muid); if ((proto_version == p9_proto_2000u) || (proto_version == p9_proto_2000L)) { ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */ if (wst->extension) ret += strlen(wst->extension); } return ret; } int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) { int err; struct p9_req_t *req; struct p9_client *clnt; err = 0; clnt = fid->clnt; wst->size = p9_client_statsize(wst, clnt->proto_version); p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid); p9_debug(P9_DEBUG_9P, " sz=%x type=%x dev=%x qid=%x.%llx.%x\n" " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n" " name=%s uid=%s gid=%s muid=%s extension=(%s)\n" " uid=%d gid=%d n_muid=%d\n", wst->size, wst->type, wst->dev, wst->qid.type, (unsigned long long)wst->qid.path, wst->qid.version, wst->mode, wst->atime, wst->mtime, (unsigned long long)wst->length, wst->name, wst->uid, wst->gid, wst->muid, wst->extension, from_kuid(&init_user_ns, wst->n_uid), from_kgid(&init_user_ns, wst->n_gid), from_kuid(&init_user_ns, wst->n_muid)); req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_wstat); int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) { int err; struct p9_req_t *req; struct p9_client *clnt; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid); p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n" " atime_sec=%lld atime_nsec=%lld\n" " mtime_sec=%lld mtime_nsec=%lld\n", p9attr->valid, p9attr->mode, from_kuid(&init_user_ns, p9attr->uid), from_kgid(&init_user_ns, p9attr->gid), p9attr->size, p9attr->atime_sec, p9attr->atime_nsec, p9attr->mtime_sec, p9attr->mtime_nsec); req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_setattr); int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) { int err; struct p9_req_t *req; struct p9_client *clnt; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TSTATFS, "d", fid->fid); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld " "blocks %llu bfree %llu bavail %llu files %llu ffree %llu " "fsid %llu namelen %ld\n", fid->fid, (long unsigned int)sb->type, (long int)sb->bsize, sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, sb->fsid, (long int)sb->namelen); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_statfs); int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name) { int err; struct p9_req_t *req; struct p9_client *clnt; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n", fid->fid, newdirfid->fid, name); req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid, newdirfid->fid, name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_rename); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name) { int err; struct p9_req_t *req; struct p9_client *clnt; err = 0; clnt = olddirfid->clnt; p9_debug(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s" " newdirfid %d new name %s\n", olddirfid->fid, old_name, newdirfid->fid, new_name); req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid, old_name, newdirfid->fid, new_name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_renameat); /* * An xattrwalk without @attr_name gives the fid for the lisxattr namespace */ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, const char *attr_name, u64 *attr_size) { int err; struct p9_req_t *req; struct p9_client *clnt; struct p9_fid *attr_fid; err = 0; clnt = file_fid->clnt; attr_fid = p9_fid_create(clnt); if (!attr_fid) { err = -ENOMEM; goto error; } p9_debug(P9_DEBUG_9P, ">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n", file_fid->fid, attr_fid->fid, attr_name); req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds", file_fid->fid, attr_fid->fid, attr_name); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_tag_remove(clnt, req); goto clunk_fid; } p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; clunk_fid: p9_client_clunk(attr_fid); attr_fid = NULL; error: if (attr_fid && (attr_fid != file_fid)) p9_fid_destroy(attr_fid); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(p9_client_xattrwalk); int p9_client_xattrcreate(struct p9_fid *fid, const char *name, u64 attr_size, int flags) { int err; struct p9_req_t *req; struct p9_client *clnt; p9_debug(P9_DEBUG_9P, ">>> TXATTRCREATE fid %d name %s size %lld flag %d\n", fid->fid, name, (long long)attr_size, flags); err = 0; clnt = fid->clnt; req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd", fid->fid, name, attr_size, flags); if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL_GPL(p9_client_xattrcreate); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) { int err, rsize, non_zc = 0; struct p9_client *clnt; struct p9_req_t *req; char *dataptr; struct kvec kv = {.iov_base = data, .iov_len = count}; struct iov_iter to; iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count); p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", fid->fid, (unsigned long long) offset, count); err = 0; clnt = fid->clnt; rsize = fid->iounit; if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ) rsize = clnt->msize - P9_READDIRHDRSZ; if (count < rsize) rsize = count; /* Don't bother zerocopy for small IO (< 1024) */ if (clnt->trans_mod->zc_request && rsize > 1024) { /* * response header len is 11 * PDU Header(7) + IO Size (4) */ req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0, 11, "dqd", fid->fid, offset, rsize); } else { non_zc = 1; req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, offset, rsize); } if (IS_ERR(req)) { err = PTR_ERR(req); goto error; } err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } if (rsize < count) { pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize); count = rsize; } p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); if (non_zc) memmove(data, dataptr, count); p9_tag_remove(clnt, req); return count; free_and_error: p9_tag_remove(clnt, req); error: return err; } EXPORT_SYMBOL(p9_client_readdir); int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, dev_t rdev, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d " "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev)); req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev), gid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_mknod_dotl); int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, kgid_t gid, struct p9_qid *qid) { int err; struct p9_client *clnt; struct p9_req_t *req; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n", fid->fid, name, mode, from_kgid(&init_user_ns, gid)); req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", fid->fid, name, mode, gid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_mkdir_dotl); int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) { int err; struct p9_client *clnt; struct p9_req_t *req; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d " "start %lld length %lld proc_id %d client_id %s\n", fid->fid, flock->type, flock->flags, flock->start, flock->length, flock->proc_id, flock->client_id); req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type, flock->flags, flock->start, flock->length, flock->proc_id, flock->client_id); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_lock_dotl); int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) { int err; struct p9_client *clnt; struct p9_req_t *req; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld " "length %lld proc_id %d client_id %s\n", fid->fid, glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, &glock->start, &glock->length, &glock->proc_id, &glock->client_id); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " "proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_getlock_dotl); int p9_client_readlink(struct p9_fid *fid, char **target) { int err; struct p9_client *clnt; struct p9_req_t *req; err = 0; clnt = fid->clnt; p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid); req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid); if (IS_ERR(req)) return PTR_ERR(req); err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); if (err) { trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_readlink); int __init p9_client_init(void) { p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); return p9_req_cache ? 0 : -ENOMEM; } void __exit p9_client_exit(void) { kmem_cache_destroy(p9_req_cache); } |
1 1 1 1 1 1 1 1 1 1 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 | /* * Copyright (C) 2012-2014 Canonical Ltd (Maarten Lankhorst) * * Based on bo.c which bears the following copyright notice, * but is dual licensed: * * Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /* * Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com> */ #include <linux/reservation.h> #include <linux/export.h> /** * DOC: Reservation Object Overview * * The reservation object provides a mechanism to manage shared and * exclusive fences associated with a buffer. A reservation object * can have attached one exclusive fence (normally associated with * write operations) or N shared fences (read operations). The RCU * mechanism is used to protect read access to fences from locked * write-side updates. */ DEFINE_WD_CLASS(reservation_ww_class); EXPORT_SYMBOL(reservation_ww_class); struct lock_class_key reservation_seqcount_class; EXPORT_SYMBOL(reservation_seqcount_class); const char reservation_seqcount_string[] = "reservation_seqcount"; EXPORT_SYMBOL(reservation_seqcount_string); /** * reservation_object_reserve_shared - Reserve space to add a shared * fence to a reservation_object. * @obj: reservation object * * Should be called before reservation_object_add_shared_fence(). Must * be called with obj->lock held. * * RETURNS * Zero for success, or -errno */ int reservation_object_reserve_shared(struct reservation_object *obj) { struct reservation_object_list *fobj, *old; u32 max; old = reservation_object_get_list(obj); if (old && old->shared_max) { if (old->shared_count < old->shared_max) { /* perform an in-place update */ kfree(obj->staged); obj->staged = NULL; return 0; } else max = old->shared_max * 2; } else max = 4; /* * resize obj->staged or allocate if it doesn't exist, * noop if already correct size */ fobj = krealloc(obj->staged, offsetof(typeof(*fobj), shared[max]), GFP_KERNEL); if (!fobj) return -ENOMEM; obj->staged = fobj; fobj->shared_max = max; return 0; } EXPORT_SYMBOL(reservation_object_reserve_shared); static void reservation_object_add_shared_inplace(struct reservation_object *obj, struct reservation_object_list *fobj, struct dma_fence *fence) { struct dma_fence *signaled = NULL; u32 i, signaled_idx; dma_fence_get(fence); preempt_disable(); write_seqcount_begin(&obj->seq); for (i = 0; i < fobj->shared_count; ++i) { struct dma_fence *old_fence; old_fence = rcu_dereference_protected(fobj->shared[i], reservation_object_held(obj)); if (old_fence->context == fence->context) { /* memory barrier is added by write_seqcount_begin */ RCU_INIT_POINTER(fobj->shared[i], fence); write_seqcount_end(&obj->seq); preempt_enable(); dma_fence_put(old_fence); return; } if (!signaled && dma_fence_is_signaled(old_fence)) { signaled = old_fence; signaled_idx = i; } } /* * memory barrier is added by write_seqcount_begin, * fobj->shared_count is protected by this lock too */ if (signaled) { RCU_INIT_POINTER(fobj->shared[signaled_idx], fence); } else { BUG_ON(fobj->shared_count >= fobj->shared_max); RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence); fobj->shared_count++; } write_seqcount_end(&obj->seq); preempt_enable(); dma_fence_put(signaled); } static void reservation_object_add_shared_replace(struct reservation_object *obj, struct reservation_object_list *old, struct reservation_object_list *fobj, struct dma_fence *fence) { unsigned i, j, k; dma_fence_get(fence); if (!old) { RCU_INIT_POINTER(fobj->shared[0], fence); fobj->shared_count = 1; goto done; } /* * no need to bump fence refcounts, rcu_read access * requires the use of kref_get_unless_zero, and the * references from the old struct are carried over to * the new. */ for (i = 0, j = 0, k = fobj->shared_max; i < old->shared_count; ++i) { struct dma_fence *check; check = rcu_dereference_protected(old->shared[i], reservation_object_held(obj)); if (check->context == fence->context || dma_fence_is_signaled(check)) RCU_INIT_POINTER(fobj->shared[--k], check); else RCU_INIT_POINTER(fobj->shared[j++], check); } fobj->shared_count = j; RCU_INIT_POINTER(fobj->shared[fobj->shared_count], fence); fobj->shared_count++; done: preempt_disable(); write_seqcount_begin(&obj->seq); /* * RCU_INIT_POINTER can be used here, * seqcount provides the necessary barriers */ RCU_INIT_POINTER(obj->fence, fobj); write_seqcount_end(&obj->seq); preempt_enable(); if (!old) return; /* Drop the references to the signaled fences */ for (i = k; i < fobj->shared_max; ++i) { struct dma_fence *f; f = rcu_dereference_protected(fobj->shared[i], reservation_object_held(obj)); dma_fence_put(f); } kfree_rcu(old, rcu); } /** * reservation_object_add_shared_fence - Add a fence to a shared slot * @obj: the reservation object * @fence: the shared fence to add * * Add a fence to a shared slot, obj->lock must be held, and * reservation_object_reserve_shared() has been called. */ void reservation_object_add_shared_fence(struct reservation_object *obj, struct dma_fence *fence) { struct reservation_object_list *old, *fobj = obj->staged; old = reservation_object_get_list(obj); obj->staged = NULL; if (!fobj) reservation_object_add_shared_inplace(obj, old, fence); else reservation_object_add_shared_replace(obj, old, fobj, fence); } EXPORT_SYMBOL(reservation_object_add_shared_fence); /** * reservation_object_add_excl_fence - Add an exclusive fence. * @obj: the reservation object * @fence: the shared fence to add * * Add a fence to the exclusive slot. The obj->lock must be held. */ void reservation_object_add_excl_fence(struct reservation_object *obj, struct dma_fence *fence) { struct dma_fence *old_fence = reservation_object_get_excl(obj); struct reservation_object_list *old; u32 i = 0; old = reservation_object_get_list(obj); if (old) i = old->shared_count; if (fence) dma_fence_get(fence); preempt_disable(); write_seqcount_begin(&obj->seq); /* write_seqcount_begin provides the necessary memory barrier */ RCU_INIT_POINTER(obj->fence_excl, fence); if (old) old->shared_count = 0; write_seqcount_end(&obj->seq); preempt_enable(); /* inplace update, no shared fences */ while (i--) dma_fence_put(rcu_dereference_protected(old->shared[i], reservation_object_held(obj))); dma_fence_put(old_fence); } EXPORT_SYMBOL(reservation_object_add_excl_fence); /** * reservation_object_copy_fences - Copy all fences from src to dst. * @dst: the destination reservation object * @src: the source reservation object * * Copy all fences from src to dst. dst-lock must be held. */ int reservation_object_copy_fences(struct reservation_object *dst, struct reservation_object *src) { struct reservation_object_list *src_list, *dst_list; struct dma_fence *old, *new; size_t size; unsigned i; rcu_read_lock(); src_list = rcu_dereference(src->fence); retry: if (src_list) { unsigned shared_count = src_list->shared_count; size = offsetof(typeof(*src_list), shared[shared_count]); rcu_read_unlock(); dst_list = kmalloc(size, GFP_KERNEL); if (!dst_list) return -ENOMEM; rcu_read_lock(); src_list = rcu_dereference(src->fence); if (!src_list || src_list->shared_count > shared_count) { kfree(dst_list); goto retry; } dst_list->shared_count = 0; dst_list->shared_max = shared_count; for (i = 0; i < src_list->shared_count; ++i) { struct dma_fence *fence; fence = rcu_dereference(src_list->shared[i]); if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) continue; if (!dma_fence_get_rcu(fence)) { kfree(dst_list); src_list = rcu_dereference(src->fence); goto retry; } if (dma_fence_is_signaled(fence)) { dma_fence_put(fence); continue; } rcu_assign_pointer(dst_list->shared[dst_list->shared_count++], fence); } } else { dst_list = NULL; } new = dma_fence_get_rcu_safe(&src->fence_excl); rcu_read_unlock(); kfree(dst->staged); dst->staged = NULL; src_list = reservation_object_get_list(dst); old = reservation_object_get_excl(dst); preempt_disable(); write_seqcount_begin(&dst->seq); /* write_seqcount_begin provides the necessary memory barrier */ RCU_INIT_POINTER(dst->fence_excl, new); RCU_INIT_POINTER(dst->fence, dst_list); write_seqcount_end(&dst->seq); preempt_enable(); if (src_list) kfree_rcu(src_list, rcu); dma_fence_put(old); return 0; } EXPORT_SYMBOL(reservation_object_copy_fences); /** * reservation_object_get_fences_rcu - Get an object's shared and exclusive * fences without update side lock held * @obj: the reservation object * @pfence_excl: the returned exclusive fence (or NULL) * @pshared_count: the number of shared fences returned * @pshared: the array of shared fence ptrs returned (array is krealloc'd to * the required size, and must be freed by caller) * * Retrieve all fences from the reservation object. If the pointer for the * exclusive fence is not specified the fence is put into the array of the * shared fences as well. Returns either zero or -ENOMEM. */ int reservation_object_get_fences_rcu(struct reservation_object *obj, struct dma_fence **pfence_excl, unsigned *pshared_count, struct dma_fence ***pshared) { struct dma_fence **shared = NULL; struct dma_fence *fence_excl; unsigned int shared_count; int ret = 1; do { struct reservation_object_list *fobj; unsigned int i, seq; size_t sz = 0; shared_count = i = 0; rcu_read_lock(); seq = read_seqcount_begin(&obj->seq); fence_excl = rcu_dereference(obj->fence_excl); if (fence_excl && !dma_fence_get_rcu(fence_excl)) goto unlock; fobj = rcu_dereference(obj->fence); if (fobj) sz += sizeof(*shared) * fobj->shared_max; if (!pfence_excl && fence_excl) sz += sizeof(*shared); if (sz) { struct dma_fence **nshared; nshared = krealloc(shared, sz, GFP_NOWAIT | __GFP_NOWARN); if (!nshared) { rcu_read_unlock(); dma_fence_put(fence_excl); fence_excl = NULL; nshared = krealloc(shared, sz, GFP_KERNEL); if (nshared) { shared = nshared; continue; } ret = -ENOMEM; break; } shared = nshared; shared_count = fobj ? fobj->shared_count : 0; for (i = 0; i < shared_count; ++i) { shared[i] = rcu_dereference(fobj->shared[i]); if (!dma_fence_get_rcu(shared[i])) break; } if (!pfence_excl && fence_excl) { shared[i] = fence_excl; fence_excl = NULL; ++i; ++shared_count; } } if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) { while (i--) dma_fence_put(shared[i]); dma_fence_put(fence_excl); goto unlock; } ret = 0; unlock: rcu_read_unlock(); } while (ret); if (!shared_count) { kfree(shared); shared = NULL; } *pshared_count = shared_count; *pshared = shared; if (pfence_excl) *pfence_excl = fence_excl; return ret; } EXPORT_SYMBOL_GPL(reservation_object_get_fences_rcu); /** * reservation_object_wait_timeout_rcu - Wait on reservation's objects * shared and/or exclusive fences. * @obj: the reservation object * @wait_all: if true, wait on all fences, else wait on just exclusive fence * @intr: if true, do interruptible wait * @timeout: timeout value in jiffies or zero to return immediately * * RETURNS * Returns -ERESTARTSYS if interrupted, 0 if the wait timed out, or * greater than zer on success. */ long reservation_object_wait_timeout_rcu(struct reservation_object *obj, bool wait_all, bool intr, unsigned long timeout) { struct dma_fence *fence; unsigned seq, shared_count; long ret = timeout ? timeout : 1; int i; retry: shared_count = 0; seq = read_seqcount_begin(&obj->seq); rcu_read_lock(); i = -1; fence = rcu_dereference(obj->fence_excl); if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { if (!dma_fence_get_rcu(fence)) goto unlock_retry; if (dma_fence_is_signaled(fence)) { dma_fence_put(fence); fence = NULL; } } else { fence = NULL; } if (wait_all) { struct reservation_object_list *fobj = rcu_dereference(obj->fence); if (fobj) shared_count = fobj->shared_count; for (i = 0; !fence && i < shared_count; ++i) { struct dma_fence *lfence = rcu_dereference(fobj->shared[i]); if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &lfence->flags)) continue; if (!dma_fence_get_rcu(lfence)) goto unlock_retry; if (dma_fence_is_signaled(lfence)) { dma_fence_put(lfence); continue; } fence = lfence; break; } } rcu_read_unlock(); if (fence) { if (read_seqcount_retry(&obj->seq, seq)) { dma_fence_put(fence); goto retry; } ret = dma_fence_wait_timeout(fence, intr, ret); dma_fence_put(fence); if (ret > 0 && wait_all && (i + 1 < shared_count)) goto retry; } return ret; unlock_retry: rcu_read_unlock(); goto retry; } EXPORT_SYMBOL_GPL(reservation_object_wait_timeout_rcu); static inline int reservation_object_test_signaled_single(struct dma_fence *passed_fence) { struct dma_fence *fence, *lfence = passed_fence; int ret = 1; if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &lfence->flags)) { fence = dma_fence_get_rcu(lfence); if (!fence) return -1; ret = !!dma_fence_is_signaled(fence); dma_fence_put(fence); } return ret; } /** * reservation_object_test_signaled_rcu - Test if a reservation object's * fences have been signaled. * @obj: the reservation object * @test_all: if true, test all fences, otherwise only test the exclusive * fence * * RETURNS * true if all fences signaled, else false */ bool reservation_object_test_signaled_rcu(struct reservation_object *obj, bool test_all) { unsigned seq, shared_count; int ret; rcu_read_lock(); retry: ret = true; shared_count = 0; seq = read_seqcount_begin(&obj->seq); if (test_all) { unsigned i; struct reservation_object_list *fobj = rcu_dereference(obj->fence); if (fobj) shared_count = fobj->shared_count; for (i = 0; i < shared_count; ++i) { struct dma_fence *fence = rcu_dereference(fobj->shared[i]); ret = reservation_object_test_signaled_single(fence); if (ret < 0) goto retry; else if (!ret) break; } if (read_seqcount_retry(&obj->seq, seq)) goto retry; } if (!shared_count) { struct dma_fence *fence_excl = rcu_dereference(obj->fence_excl); if (fence_excl) { ret = reservation_object_test_signaled_single( fence_excl); if (ret < 0) goto retry; if (read_seqcount_retry(&obj->seq, seq)) goto retry; } } rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(reservation_object_test_signaled_rcu); |
30 16 16 5 12 1 1 1 12 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 | // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Basic Transport Functions exploiting Infiniband API * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/socket.h> #include <linux/if_vlan.h> #include <linux/random.h> #include <linux/workqueue.h> #include <net/tcp.h> #include <net/sock.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include "smc.h" #include "smc_clc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_wr.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) #define SMC_LGR_FREE_DELAY_FAST (8 * HZ) static struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, }; static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr) { mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST); } /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock * @smc connection to register * Returns 0 on success, != otherwise. */ static void smc_lgr_add_alert_token(struct smc_connection *conn) { struct rb_node **link, *parent = NULL; u32 token = conn->alert_token_local; link = &conn->lgr->conns_all.rb_node; while (*link) { struct smc_connection *cur = rb_entry(*link, struct smc_connection, alert_node); parent = *link; if (cur->alert_token_local > token) link = &parent->rb_left; else link = &parent->rb_right; } /* Put the new node there */ rb_link_node(&conn->alert_node, parent, link); rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ static void smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); /* find a new alert_token_local value not yet used by some connection * in this link group */ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ while (!conn->alert_token_local) { conn->alert_token_local = atomic_inc_return(&nexttoken); if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; } /* Unregister connection and reset the alert token of the given connection< */ static void __smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); lgr->conns_num--; conn->alert_token_local = 0; conn->lgr = NULL; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } /* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!lgr) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); } /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). */ static int smc_link_send_delete(struct smc_link *lnk) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) { smc_llc_link_deleting(lnk); return 0; } return -ENOTCONN; } static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); bool conns; spin_lock_bh(&smc_lgr_list.lock); if (list_empty(&lgr->list)) goto free; read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ spin_unlock_bh(&smc_lgr_list.lock); return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); if (!lgr->is_smcd && !lgr->terminating) { /* try to send del link msg, on error free lgr immediately */ if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) { /* reschedule in case we never receive a response */ smc_lgr_schedule_free_work(lgr); return; } } if (!delayed_work_pending(&lgr->free_work)) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) smc_llc_link_inactive(lnk); smc_lgr_free(lgr); } } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, struct smc_ib_device *smcibdev, u8 ibport, char *peer_systemid, unsigned short vlan_id, struct smcd_dev *smcismdev, u64 peer_gid) { struct smc_link_group *lgr; struct smc_link *lnk; u8 rndvec[3]; int rc = 0; int i; if (is_smcd && vlan_id) { rc = smc_ism_get_vlan(smcismdev, vlan_id); if (rc) goto out; } lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } lgr->is_smcd = is_smcd; lgr->sync_err = 0; lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; if (is_smcd) { /* SMC-D specific settings */ lgr->peer_gid = peer_gid; lgr->smcd = smcismdev; } else { /* SMC-R specific settings */ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->state = SMC_LNK_ACTIVATING; lnk->link_id = SMC_SINGLE_LINK; lnk->smcibdev = smcibdev; lnk->ibport = ibport; lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; if (!smcibdev->initialized) smc_ib_setup_per_ibdev(smcibdev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, vlan_id, lnk->gid, &lnk->sgid_index); if (rc) goto free_lgr; rc = smc_llc_link_init(lnk); if (rc) goto free_lgr; rc = smc_wr_alloc_link_mem(lnk); if (rc) goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; rc = smc_ib_create_queue_pair(lnk); if (rc) goto dealloc_pd; rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; } smc->conn.lgr = lgr; spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); return 0; destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk); free_lgr: kfree(lgr); out: return rc; } static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; } else { /* buf registration failed, reuse not possible */ write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); smc_buf_free(lgr, true, conn->rmb_desc); } } } /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!lgr) return; if (lgr->is_smcd) { smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } smc_buf_unuse(conn, lgr); /* allow buffer reuse */ smc_lgr_unregister_conn(conn); /* unsets conn->lgr */ if (!lgr->conns_num) smc_lgr_schedule_free_work(lgr); } static void smc_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (is_rmb) { if (buf_desc->mr_rx[SMC_SINGLE_LINK]) smc_ib_put_memory_region( buf_desc->mr_rx[SMC_SINGLE_LINK]); smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, DMA_FROM_DEVICE); } else { smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, DMA_TO_DEVICE); } sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); } static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, struct smc_buf_desc *buf_desc) { if (is_dmb) { /* restore original buf len */ buf_desc->len += sizeof(struct smcd_cdc_msg); smc_ism_unregister_dmb(lgr->smcd, buf_desc); } else { kfree(buf_desc->cpu_addr); } kfree(buf_desc); } static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { if (lgr->is_smcd) smcd_buf_free(lgr, is_rmb, buf_desc); else smcr_buf_free(lgr, is_rmb, buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { if (is_rmb) buf_list = &lgr->rmbs[i]; else buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); smc_buf_free(lgr, is_rmb, buf_desc); } } } static void smc_lgr_free_bufs(struct smc_link_group *lgr) { /* free send buffers */ __smc_lgr_free_bufs(lgr, false); /* free rmbs */ __smc_lgr_free_bufs(lgr, true); } /* remove a link group */ void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); if (lgr->is_smcd) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); else smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } void smc_lgr_forget(struct smc_link_group *lgr) { spin_lock_bh(&smc_lgr_list.lock); /* do not use this link group for new connections */ if (!list_empty(&lgr->list)) list_del_init(&lgr->list); spin_unlock_bh(&smc_lgr_list.lock); } /* terminate linkgroup abnormally */ static void __smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; if (lgr->terminating) return; /* lgr already terminating */ lgr->terminating = 1; if (!list_empty(&lgr->list)) /* forget lgr */ list_del_init(&lgr->list); if (!lgr->is_smcd) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put in close work */ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; __smc_lgr_unregister_conn(conn); write_unlock_bh(&lgr->conns_lock); if (!schedule_work(&conn->close_work)) sock_put(&smc->sk); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); if (!lgr->is_smcd) wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); smc_lgr_schedule_free_work(lgr); } void smc_lgr_terminate(struct smc_link_group *lgr) { spin_lock_bh(&smc_lgr_list.lock); __smc_lgr_terminate(lgr); spin_unlock_bh(&smc_lgr_list.lock); } /* Called when IB port is terminated */ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (!lgr->is_smcd && lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) __smc_lgr_terminate(lgr); } spin_unlock_bh(&smc_lgr_list.lock); } /* Called when SMC-D device is terminated or peer is lost */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); /* run common cleanup function and build free list */ spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { if (lgr->is_smcd && lgr->smcd == dev && (!peer_gid || lgr->peer_gid == peer_gid) && !list_empty(&lgr->list)) { __smc_lgr_terminate(lgr); list_move(&lgr->list, &lgr_free_list); } } spin_unlock_bh(&smc_lgr_list.lock); /* cancel the regular free workers and actually free lgrs */ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { list_del_init(&lgr->list); cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); } } /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct net_device *ndev; int i, nest_lvl, rc = 0; *vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } ndev = dst->dev; if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); goto out_rel; } rtnl_lock(); nest_lvl = dev_get_nest_level(ndev); for (i = 0; i < nest_lvl; i++) { struct list_head *lower = &ndev->adj_list.lower; if (list_empty(lower)) break; lower = lower->next; ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); break; } } rtnl_unlock(); out_rel: dst_release(dst); out: return rc; } static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role) { return !memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, SMC_GID_SIZE) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, sizeof(lcl->mac)) && lgr->role == role; } static bool smcd_lgr_match(struct smc_link_group *lgr, struct smcd_dev *smcismdev, u64 peer_gid) { return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, u64 peer_gid) { struct smc_connection *conn = &smc->conn; int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; unsigned short vlan_id; enum smc_lgr_role role; int rc = 0; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); if (rc) return rc; if ((role == SMC_CLNT) && srv_first_contact) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : smcr_lgr_match(lgr, lcl, role)) && !lgr->sync_err && lgr->vlan_id == vlan_id && (role == SMC_CLNT || (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ if (delayed_work_pending(&lgr->free_work)) cancel_delayed_work(&lgr->free_work); write_unlock_bh(&lgr->conns_lock); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(&smc_lgr_list.lock); if (role == SMC_CLNT && !srv_first_contact && (local_contact == SMC_FIRST_CONTACT)) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ return -ENOLINK; } create: if (local_contact == SMC_FIRST_CONTACT) { rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, lcl->id_for_peer, vlan_id, smcd, peer_gid); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; if (is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ } #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif out: return rc ? rc : local_contact; } /* convert the RMB size into the compressed notation - minimum 16K. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ static u8 smc_compress_bufsize(int size) { u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; size = (size - 1) >> 14; compressed = ilog2(size) + 1; if (compressed >= SMC_RMBE_SIZES) compressed = SMC_RMBE_SIZES - 1; return compressed; } /* convert the RMB size from compressed notation into integer */ int smc_uncompress_bufsize(u8 compressed) { u32 size; size = 0x00000001 << (((int)compressed) + 14); return (int)size; } /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, rwlock_t *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; read_lock_bh(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { read_unlock_bh(lock); return buf_slot; } } read_unlock_bh(lock); return NULL; } /* one of the conditions for announcing a receiver's current window size is * that it "results in a minimum increase in the window size of 10% of the * receive buffer space" [RFC7609] */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); buf_desc->order = get_order(bufsize); buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_COMP | __GFP_NORETRY | __GFP_ZERO, buf_desc->order); if (!buf_desc->pages) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, GFP_KERNEL); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(-EAGAIN); } /* create a new memory region for the RMB */ if (is_rmb) { rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE, buf_desc); if (rc) { smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } } buf_desc->len = bufsize; return buf_desc; } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) return ERR_PTR(-EAGAIN); /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); if (is_dmb) { rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg); } else { buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!buf_desc->cpu_addr) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->len = bufsize; } return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; for (bufsize_short = smc_compress_bufsize(sk_buf_size); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; } else { lock = &lgr->sndbufs_lock; buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) continue; buf_desc->used = 1; write_lock_bh(lock); list_add(&buf_desc->list, buf_list); write_unlock_bh(lock); break; /* found */ } if (IS_ERR(buf_desc)) return -ENOMEM; if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); if (is_smcd) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!conn->lgr || conn->lgr->is_smcd) return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!conn->lgr || conn->lgr->is_smcd) return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!conn->lgr || conn->lgr->is_smcd) return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; if (!conn->lgr || conn->lgr->is_smcd) return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); if (rc) smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { if (!test_and_set_bit(i, lgr->rtokens_used_mask)) return i; } return -ENOSPC; } /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) { u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; return i; } /* delete an rtoken */ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) { u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; clear_bit(i, lgr->rtokens_used_mask); return 0; } } return -ENOENT; } /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc) { conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; } /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_freeing_list); spin_lock_bh(&smc_lgr_list.lock); if (!list_empty(&smc_lgr_list.list)) list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); if (!lgr->is_smcd) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (lnk->state == SMC_LNK_ACTIVE) smc_llc_send_delete_link(lnk, SMC_LLC_REQ, false); smc_llc_link_inactive(lnk); } cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } } |
12 12 5 5 5 12 11 6 9 1 1 1 1 1 1 13 12 13 11 12 12 11 1 12 6 3 3 3 2 2 1 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 | /* * truncate.c * * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: * ftp://prep.ai.mit.edu/pub/gnu/GPL * Each contributing author retains all rights to their own work. * * (C) 1999-2004 Ben Fennema * (C) 1999 Stelias Computing Inc * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> #include "udf_i.h" #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (nelen) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, eloc, 0, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block > first_block) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) udf_free_blocks(inode->i_sb, inode, eloc, first_block, last_block - first_block); } } } /* * Truncate the last extent to match i_size. This function assumes * that preallocation extent is already truncated. */ void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; /* Are we going to delete the file anyway? */ if (inode->i_nlink == 0) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); /* Find the last extent in the file */ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { etype = netype; lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) udf_warn(inode->i_sb, "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", (unsigned)inode->i_ino, (long long)inode->i_size, (long long)lbcount, (unsigned)eloc.logicalBlockNum, (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) udf_err(inode->i_sb, "Extent after EOF in inode %u\n", (unsigned)inode->i_ino); break; } } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = {}; struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = 1 << inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; /* Find the last extent in the file */ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) get_bh(prev_epos.bh); etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { lbcount -= elen; udf_delete_aext(inode, prev_epos); udf_free_blocks(inode->i_sb, inode, &eloc, 0, DIV_ROUND_UP(elen, 1 << inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, struct extent_position *epos, u32 lenalloc) { struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); int len = sizeof(struct allocExtDesc); aed->lengthAllocDescs = cpu_to_le32(lenalloc); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) len += lenalloc; udf_update_tag(epos->bh->b_data, len); mark_buffer_dirty_inode(epos->bh, inode); } /* * Truncate extents of inode to inode->i_size. This function can be used only * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ void udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; loff_t byte_offset; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); return; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; else lenalloc = epos.offset - adsize; if (!epos.bh) lenalloc -= udf_file_entry_alloc_offset(inode); else lenalloc -= sizeof(struct allocExtDesc); while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) return; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> sb->s_blocksize_bits; else indirect_ext_len = 1; } else { extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } } if (indirect_ext_len) { BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } |
1 18 15 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/mm.h> #include "util.h" extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); extern int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, unsigned size); extern void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { return !ip->i_height; } static inline int gfs2_is_jdata(const struct gfs2_inode *ip) { return ip->i_diskflags & GFS2_DIF_JDATA; } static inline int gfs2_is_writeback(const struct gfs2_inode *ip) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); } static inline int gfs2_is_ordered(const struct gfs2_inode *ip) { const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); } static inline int gfs2_is_dir(const struct gfs2_inode *ip) { return S_ISDIR(ip->i_inode.i_mode); } static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) { inode->i_blocks = blocks << (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); } static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { return inode->i_blocks >> (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); } static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change)); change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK); inode->i_blocks += change; } static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, u64 no_formal_ino) { return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; } static inline void gfs2_inum_out(const struct gfs2_inode *ip, struct gfs2_dirent *dent) { dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); } static inline int gfs2_check_internal_file_size(struct inode *inode, u64 minsize, u64 maxsize) { u64 size = i_size_read(inode); if (size < minsize || size > maxsize) goto err; if (size & (BIT(inode->i_blkbits) - 1)) goto err; return 0; err: gfs2_consist_inode(GFS2_I(inode)); return -EIO; } extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino, unsigned int blktype); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); extern int gfs2_open_common(struct inode *inode, struct file *file); extern loff_t gfs2_seek_data(struct file *file, loff_t offset); extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; extern const struct inode_operations gfs2_symlink_iops; extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; extern void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM extern const struct file_operations gfs2_file_fops; extern const struct file_operations gfs2_dir_fops; static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return sdp->sd_args.ar_localflocks; } #else /* Single node only */ #define gfs2_file_fops gfs2_file_fops_nolock #define gfs2_dir_fops gfs2_dir_fops_nolock static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) { return 1; } #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ #endif /* __INODE_DOT_H__ */ |
34 22 35 6 35 22 34 38 18 6 12 20 20 7 13 45 45 45 45 14 14 2 2 2 2 47 22 7 7 25 25 25 47 4 47 46 46 46 9 45 7 46 9 45 46 46 45 45 44 45 7 45 42 45 34 45 42 45 34 45 45 45 45 1 45 23 22 45 45 47 44 44 45 45 45 45 45 23 22 45 23 44 45 44 45 44 45 45 45 38 57 58 59 57 2 58 57 3 3 3 3 5 2 2 4 8 8 8 65 64 65 45 65 53 45 45 14 13 140 114 102 102 101 102 102 31 143 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 | /* * linux/drivers/video/vga16.c -- VGA 16-color framebuffer driver * * Copyright 1999 Ben Pfaff <pfaffben@debian.org> and Petr Vandrovec <VANDROVE@vc.cvut.cz> * Based on VGA info at http://www.goodnet.com/~tinara/FreeVGA/home.htm * Based on VESA framebuffer (c) 1998 Gerd Knorr <kraxel@goldbach.in-berlin.de> * * This file is subject to the terms and conditions of the GNU General * Public License. See the file COPYING in the main directory of this * archive for more details. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/delay.h> #include <linux/fb.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/platform_device.h> #include <linux/screen_info.h> #include <asm/io.h> #include <video/vga.h> #define VGA_FB_PHYS 0xA0000 #define VGA_FB_PHYS_LEN 65536 #define MODE_SKIP4 1 #define MODE_8BPP 2 #define MODE_CFB 4 #define MODE_TEXT 8 /* --------------------------------------------------------------------- */ /* * card parameters */ struct vga16fb_par { /* structure holding original VGA register settings when the screen is blanked */ struct { unsigned char SeqCtrlIndex; /* Sequencer Index reg. */ unsigned char CrtCtrlIndex; /* CRT-Contr. Index reg. */ unsigned char CrtMiscIO; /* Miscellaneous register */ unsigned char HorizontalTotal; /* CRT-Controller:00h */ unsigned char HorizDisplayEnd; /* CRT-Controller:01h */ unsigned char StartHorizRetrace;/* CRT-Controller:04h */ unsigned char EndHorizRetrace; /* CRT-Controller:05h */ unsigned char Overflow; /* CRT-Controller:07h */ unsigned char StartVertRetrace; /* CRT-Controller:10h */ unsigned char EndVertRetrace; /* CRT-Controller:11h */ unsigned char ModeControl; /* CRT-Controller:17h */ unsigned char ClockingMode; /* Seq-Controller:01h */ } vga_state; struct vgastate state; unsigned int ref_count; int palette_blanked, vesa_blanked, mode, isVGA; u8 misc, pel_msk, vss, clkdiv; u8 crtc[VGA_CRT_C]; }; /* --------------------------------------------------------------------- */ static struct fb_var_screeninfo vga16fb_defined = { .xres = 640, .yres = 480, .xres_virtual = 640, .yres_virtual = 480, .bits_per_pixel = 4, .activate = FB_ACTIVATE_TEST, .height = -1, .width = -1, .pixclock = 39721, .left_margin = 48, .right_margin = 16, .upper_margin = 33, .lower_margin = 10, .hsync_len = 96, .vsync_len = 2, .vmode = FB_VMODE_NONINTERLACED, }; /* name should not depend on EGA/VGA */ static const struct fb_fix_screeninfo vga16fb_fix = { .id = "VGA16 VGA", .smem_start = VGA_FB_PHYS, .smem_len = VGA_FB_PHYS_LEN, .type = FB_TYPE_VGA_PLANES, .type_aux = FB_AUX_VGA_PLANES_VGA4, .visual = FB_VISUAL_PSEUDOCOLOR, .xpanstep = 8, .ypanstep = 1, .line_length = 640 / 8, .accel = FB_ACCEL_NONE }; /* The VGA's weird architecture often requires that we read a byte and write a byte to the same location. It doesn't matter *what* byte we write, however. This is because all the action goes on behind the scenes in the VGA's 32-bit latch register, and reading and writing video memory just invokes latch behavior. To avoid race conditions (is this necessary?), reading and writing the memory byte should be done with a single instruction. One suitable instruction is the x86 bitwise OR. The following read-modify-write routine should optimize to one such bitwise OR. */ static inline void rmw(volatile char __iomem *p) { readb(p); writeb(1, p); } /* Set the Graphics Mode Register, and return its previous value. Bits 0-1 are write mode, bit 3 is read mode. */ static inline int setmode(int mode) { int oldmode; oldmode = vga_io_rgfx(VGA_GFX_MODE); vga_io_w(VGA_GFX_D, mode); return oldmode; } /* Select the Bit Mask Register and return its value. */ static inline int selectmask(void) { return vga_io_rgfx(VGA_GFX_BIT_MASK); } /* Set the value of the Bit Mask Register. It must already have been selected with selectmask(). */ static inline void setmask(int mask) { vga_io_w(VGA_GFX_D, mask); } /* Set the Data Rotate Register and return its old value. Bits 0-2 are rotate count, bits 3-4 are logical operation (0=NOP, 1=AND, 2=OR, 3=XOR). */ static inline int setop(int op) { int oldop; oldop = vga_io_rgfx(VGA_GFX_DATA_ROTATE); vga_io_w(VGA_GFX_D, op); return oldop; } /* Set the Enable Set/Reset Register and return its old value. The code here always uses value 0xf for this register. */ static inline int setsr(int sr) { int oldsr; oldsr = vga_io_rgfx(VGA_GFX_SR_ENABLE); vga_io_w(VGA_GFX_D, sr); return oldsr; } /* Set the Set/Reset Register and return its old value. */ static inline int setcolor(int color) { int oldcolor; oldcolor = vga_io_rgfx(VGA_GFX_SR_VALUE); vga_io_w(VGA_GFX_D, color); return oldcolor; } /* Return the value in the Graphics Address Register. */ static inline int getindex(void) { return vga_io_r(VGA_GFX_I); } /* Set the value in the Graphics Address Register. */ static inline void setindex(int index) { vga_io_w(VGA_GFX_I, index); } static void vga16fb_pan_var(struct fb_info *info, struct fb_var_screeninfo *var) { struct vga16fb_par *par = info->par; u32 xoffset, pos; xoffset = var->xoffset; if (info->var.bits_per_pixel == 8) { pos = (info->var.xres_virtual * var->yoffset + xoffset) >> 2; } else if (par->mode & MODE_TEXT) { int fh = 16; // FIXME !!! font height. Fugde for now. pos = (info->var.xres_virtual * (var->yoffset / fh) + xoffset) >> 3; } else { if (info->var.nonstd) xoffset--; pos = (info->var.xres_virtual * var->yoffset + xoffset) >> 3; } vga_io_wcrt(VGA_CRTC_START_HI, pos >> 8); vga_io_wcrt(VGA_CRTC_START_LO, pos & 0xFF); /* if we support CFB4, then we must! support xoffset with pixel * granularity if someone supports xoffset in bit resolution */ vga_io_r(VGA_IS1_RC); /* reset flip-flop */ vga_io_w(VGA_ATT_IW, VGA_ATC_PEL); if (info->var.bits_per_pixel == 8) vga_io_w(VGA_ATT_IW, (xoffset & 3) << 1); else vga_io_w(VGA_ATT_IW, xoffset & 7); vga_io_r(VGA_IS1_RC); vga_io_w(VGA_ATT_IW, 0x20); } static void vga16fb_update_fix(struct fb_info *info) { if (info->var.bits_per_pixel == 4) { if (info->var.nonstd) { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.line_length = info->var.xres_virtual / 2; } else { info->fix.type = FB_TYPE_VGA_PLANES; info->fix.type_aux = FB_AUX_VGA_PLANES_VGA4; info->fix.line_length = info->var.xres_virtual / 8; } } else if (info->var.bits_per_pixel == 0) { info->fix.type = FB_TYPE_TEXT; info->fix.type_aux = FB_AUX_TEXT_CGA; info->fix.line_length = info->var.xres_virtual / 4; } else { /* 8bpp */ if (info->var.nonstd) { info->fix.type = FB_TYPE_VGA_PLANES; info->fix.type_aux = FB_AUX_VGA_PLANES_CFB8; info->fix.line_length = info->var.xres_virtual / 4; } else { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.line_length = info->var.xres_virtual; } } } static void vga16fb_clock_chip(struct vga16fb_par *par, unsigned int *pixclock, const struct fb_info *info, int mul, int div) { static const struct { u32 pixclock; u8 misc; u8 seq_clock_mode; } *ptr, *best, vgaclocks[] = { { 79442 /* 12.587 */, 0x00, 0x08}, { 70616 /* 14.161 */, 0x04, 0x08}, { 39721 /* 25.175 */, 0x00, 0x00}, { 35308 /* 28.322 */, 0x04, 0x00}, { 0 /* bad */, 0x00, 0x00}}; int err; *pixclock = (*pixclock * mul) / div; best = vgaclocks; err = *pixclock - best->pixclock; if (err < 0) err = -err; for (ptr = vgaclocks + 1; ptr->pixclock; ptr++) { int tmp; tmp = *pixclock - ptr->pixclock; if (tmp < 0) tmp = -tmp; if (tmp < err) { err = tmp; best = ptr; } } par->misc |= best->misc; par->clkdiv = best->seq_clock_mode; *pixclock = (best->pixclock * div) / mul; } #define FAIL(X) return -EINVAL static int vga16fb_open(struct fb_info *info, int user) { struct vga16fb_par *par = info->par; if (!par->ref_count) { memset(&par->state, 0, sizeof(struct vgastate)); par->state.flags = VGA_SAVE_FONTS | VGA_SAVE_MODE | VGA_SAVE_CMAP; save_vga(&par->state); } par->ref_count++; return 0; } static int vga16fb_release(struct fb_info *info, int user) { struct vga16fb_par *par = info->par; if (!par->ref_count) return -EINVAL; if (par->ref_count == 1) restore_vga(&par->state); par->ref_count--; return 0; } static int vga16fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct vga16fb_par *par = info->par; u32 xres, right, hslen, left, xtotal; u32 yres, lower, vslen, upper, ytotal; u32 vxres, xoffset, vyres, yoffset; u32 pos; u8 r7, rMode; int shift; int mode; u32 maxmem; par->pel_msk = 0xFF; if (var->bits_per_pixel == 4) { if (var->nonstd) { if (!par->isVGA) return -EINVAL; shift = 3; mode = MODE_SKIP4 | MODE_CFB; maxmem = 16384; par->pel_msk = 0x0F; } else { shift = 3; mode = 0; maxmem = 65536; } } else if (var->bits_per_pixel == 8) { if (!par->isVGA) return -EINVAL; /* no support on EGA */ shift = 2; if (var->nonstd) { mode = MODE_8BPP | MODE_CFB; maxmem = 65536; } else { mode = MODE_SKIP4 | MODE_8BPP | MODE_CFB; maxmem = 16384; } } else return -EINVAL; xres = (var->xres + 7) & ~7; vxres = (var->xres_virtual + 0xF) & ~0xF; xoffset = (var->xoffset + 7) & ~7; left = (var->left_margin + 7) & ~7; right = (var->right_margin + 7) & ~7; hslen = (var->hsync_len + 7) & ~7; if (vxres < xres) vxres = xres; if (xres + xoffset > vxres) xoffset = vxres - xres; var->xres = xres; var->right_margin = right; var->hsync_len = hslen; var->left_margin = left; var->xres_virtual = vxres; var->xoffset = xoffset; xres >>= shift; right >>= shift; hslen >>= shift; left >>= shift; vxres >>= shift; xtotal = xres + right + hslen + left; if (xtotal >= 256) FAIL("xtotal too big"); if (hslen > 32) FAIL("hslen too big"); if (right + hslen + left > 64) FAIL("hblank too big"); par->crtc[VGA_CRTC_H_TOTAL] = xtotal - 5; par->crtc[VGA_CRTC_H_BLANK_START] = xres - 1; par->crtc[VGA_CRTC_H_DISP] = xres - 1; pos = xres + right; par->crtc[VGA_CRTC_H_SYNC_START] = pos; pos += hslen; par->crtc[VGA_CRTC_H_SYNC_END] = pos & 0x1F; pos += left - 2; /* blank_end + 2 <= total + 5 */ par->crtc[VGA_CRTC_H_BLANK_END] = (pos & 0x1F) | 0x80; if (pos & 0x20) par->crtc[VGA_CRTC_H_SYNC_END] |= 0x80; yres = var->yres; lower = var->lower_margin; vslen = var->vsync_len; upper = var->upper_margin; vyres = var->yres_virtual; yoffset = var->yoffset; if (yres > vyres) vyres = yres; if (vxres * vyres > maxmem) { vyres = maxmem / vxres; if (vyres < yres) return -ENOMEM; } if (yoffset + yres > vyres) yoffset = vyres - yres; var->yres = yres; var->lower_margin = lower; var->vsync_len = vslen; var->upper_margin = upper; var->yres_virtual = vyres; var->yoffset = yoffset; if (var->vmode & FB_VMODE_DOUBLE) { yres <<= 1; lower <<= 1; vslen <<= 1; upper <<= 1; } ytotal = yres + lower + vslen + upper; if (ytotal > 1024) { ytotal >>= 1; yres >>= 1; lower >>= 1; vslen >>= 1; upper >>= 1; rMode = 0x04; } else rMode = 0x00; if (ytotal > 1024) FAIL("ytotal too big"); if (vslen > 16) FAIL("vslen too big"); par->crtc[VGA_CRTC_V_TOTAL] = ytotal - 2; r7 = 0x10; /* disable linecompare */ if (ytotal & 0x100) r7 |= 0x01; if (ytotal & 0x200) r7 |= 0x20; par->crtc[VGA_CRTC_PRESET_ROW] = 0; par->crtc[VGA_CRTC_MAX_SCAN] = 0x40; /* 1 scanline, no linecmp */ if (var->vmode & FB_VMODE_DOUBLE) par->crtc[VGA_CRTC_MAX_SCAN] |= 0x80; par->crtc[VGA_CRTC_CURSOR_START] = 0x20; par->crtc[VGA_CRTC_CURSOR_END] = 0x00; if ((mode & (MODE_CFB | MODE_8BPP)) == MODE_CFB) xoffset--; pos = yoffset * vxres + (xoffset >> shift); par->crtc[VGA_CRTC_START_HI] = pos >> 8; par->crtc[VGA_CRTC_START_LO] = pos & 0xFF; par->crtc[VGA_CRTC_CURSOR_HI] = 0x00; par->crtc[VGA_CRTC_CURSOR_LO] = 0x00; pos = yres - 1; par->crtc[VGA_CRTC_V_DISP_END] = pos & 0xFF; par->crtc[VGA_CRTC_V_BLANK_START] = pos & 0xFF; if (pos & 0x100) r7 |= 0x0A; /* 0x02 -> DISP_END, 0x08 -> BLANK_START */ if (pos & 0x200) { r7 |= 0x40; /* 0x40 -> DISP_END */ par->crtc[VGA_CRTC_MAX_SCAN] |= 0x20; /* BLANK_START */ } pos += lower; par->crtc[VGA_CRTC_V_SYNC_START] = pos & 0xFF; if (pos & 0x100) r7 |= 0x04; if (pos & 0x200) r7 |= 0x80; pos += vslen; par->crtc[VGA_CRTC_V_SYNC_END] = (pos & 0x0F) & ~0x10; /* disabled IRQ */ pos += upper - 1; /* blank_end + 1 <= ytotal + 2 */ par->crtc[VGA_CRTC_V_BLANK_END] = pos & 0xFF; /* 0x7F for original VGA, but some SVGA chips requires all 8 bits to set */ if (vxres >= 512) FAIL("vxres too long"); par->crtc[VGA_CRTC_OFFSET] = vxres >> 1; if (mode & MODE_SKIP4) par->crtc[VGA_CRTC_UNDERLINE] = 0x5F; /* 256, cfb8 */ else par->crtc[VGA_CRTC_UNDERLINE] = 0x1F; /* 16, vgap */ par->crtc[VGA_CRTC_MODE] = rMode | ((mode & MODE_TEXT) ? 0xA3 : 0xE3); par->crtc[VGA_CRTC_LINE_COMPARE] = 0xFF; par->crtc[VGA_CRTC_OVERFLOW] = r7; par->vss = 0x00; /* 3DA */ par->misc = 0xE3; /* enable CPU, ports 0x3Dx, positive sync */ if (var->sync & FB_SYNC_HOR_HIGH_ACT) par->misc &= ~0x40; if (var->sync & FB_SYNC_VERT_HIGH_ACT) par->misc &= ~0x80; par->mode = mode; if (mode & MODE_8BPP) /* pixel clock == vga clock / 2 */ vga16fb_clock_chip(par, &var->pixclock, info, 1, 2); else /* pixel clock == vga clock */ vga16fb_clock_chip(par, &var->pixclock, info, 1, 1); var->red.offset = var->green.offset = var->blue.offset = var->transp.offset = 0; var->red.length = var->green.length = var->blue.length = (par->isVGA) ? 6 : 2; var->transp.length = 0; var->activate = FB_ACTIVATE_NOW; var->height = -1; var->width = -1; var->accel_flags = 0; return 0; } #undef FAIL static int vga16fb_set_par(struct fb_info *info) { struct vga16fb_par *par = info->par; u8 gdc[VGA_GFX_C]; u8 seq[VGA_SEQ_C]; u8 atc[VGA_ATT_C]; int fh, i; seq[VGA_SEQ_CLOCK_MODE] = 0x01 | par->clkdiv; if (par->mode & MODE_TEXT) seq[VGA_SEQ_PLANE_WRITE] = 0x03; else seq[VGA_SEQ_PLANE_WRITE] = 0x0F; seq[VGA_SEQ_CHARACTER_MAP] = 0x00; if (par->mode & MODE_TEXT) seq[VGA_SEQ_MEMORY_MODE] = 0x03; else if (par->mode & MODE_SKIP4) seq[VGA_SEQ_MEMORY_MODE] = 0x0E; else seq[VGA_SEQ_MEMORY_MODE] = 0x06; gdc[VGA_GFX_SR_VALUE] = 0x00; gdc[VGA_GFX_SR_ENABLE] = 0x00; gdc[VGA_GFX_COMPARE_VALUE] = 0x00; gdc[VGA_GFX_DATA_ROTATE] = 0x00; gdc[VGA_GFX_PLANE_READ] = 0; if (par->mode & MODE_TEXT) { gdc[VGA_GFX_MODE] = 0x10; gdc[VGA_GFX_MISC] = 0x06; } else { if (par->mode & MODE_CFB) gdc[VGA_GFX_MODE] = 0x40; else gdc[VGA_GFX_MODE] = 0x00; gdc[VGA_GFX_MISC] = 0x05; } gdc[VGA_GFX_COMPARE_MASK] = 0x0F; gdc[VGA_GFX_BIT_MASK] = 0xFF; for (i = 0x00; i < 0x10; i++) atc[i] = i; if (par->mode & MODE_TEXT) atc[VGA_ATC_MODE] = 0x04; else if (par->mode & MODE_8BPP) atc[VGA_ATC_MODE] = 0x41; else atc[VGA_ATC_MODE] = 0x81; atc[VGA_ATC_OVERSCAN] = 0x00; /* 0 for EGA, 0xFF for VGA */ atc[VGA_ATC_PLANE_ENABLE] = 0x0F; if (par->mode & MODE_8BPP) atc[VGA_ATC_PEL] = (info->var.xoffset & 3) << 1; else atc[VGA_ATC_PEL] = info->var.xoffset & 7; atc[VGA_ATC_COLOR_PAGE] = 0x00; if (par->mode & MODE_TEXT) { fh = 16; // FIXME !!! Fudge font height. par->crtc[VGA_CRTC_MAX_SCAN] = (par->crtc[VGA_CRTC_MAX_SCAN] & ~0x1F) | (fh - 1); } vga_io_w(VGA_MIS_W, vga_io_r(VGA_MIS_R) | 0x01); /* Enable graphics register modification */ if (!par->isVGA) { vga_io_w(EGA_GFX_E0, 0x00); vga_io_w(EGA_GFX_E1, 0x01); } /* update misc output register */ vga_io_w(VGA_MIS_W, par->misc); /* synchronous reset on */ vga_io_wseq(0x00, 0x01); if (par->isVGA) vga_io_w(VGA_PEL_MSK, par->pel_msk); /* write sequencer registers */ vga_io_wseq(VGA_SEQ_CLOCK_MODE, seq[VGA_SEQ_CLOCK_MODE] | 0x20); for (i = 2; i < VGA_SEQ_C; i++) { vga_io_wseq(i, seq[i]); } /* synchronous reset off */ vga_io_wseq(0x00, 0x03); /* deprotect CRT registers 0-7 */ vga_io_wcrt(VGA_CRTC_V_SYNC_END, par->crtc[VGA_CRTC_V_SYNC_END]); /* write CRT registers */ for (i = 0; i < VGA_CRTC_REGS; i++) { vga_io_wcrt(i, par->crtc[i]); } /* write graphics controller registers */ for (i = 0; i < VGA_GFX_C; i++) { vga_io_wgfx(i, gdc[i]); } /* write attribute controller registers */ for (i = 0; i < VGA_ATT_C; i++) { vga_io_r(VGA_IS1_RC); /* reset flip-flop */ vga_io_wattr(i, atc[i]); } /* Wait for screen to stabilize. */ mdelay(50); vga_io_wseq(VGA_SEQ_CLOCK_MODE, seq[VGA_SEQ_CLOCK_MODE]); vga_io_r(VGA_IS1_RC); vga_io_w(VGA_ATT_IW, 0x20); vga16fb_update_fix(info); return 0; } static void ega16_setpalette(int regno, unsigned red, unsigned green, unsigned blue) { static const unsigned char map[] = { 000, 001, 010, 011 }; int val; if (regno >= 16) return; val = map[red>>14] | ((map[green>>14]) << 1) | ((map[blue>>14]) << 2); vga_io_r(VGA_IS1_RC); /* ! 0x3BA */ vga_io_wattr(regno, val); vga_io_r(VGA_IS1_RC); /* some clones need it */ vga_io_w(VGA_ATT_IW, 0x20); /* unblank screen */ } static void vga16_setpalette(int regno, unsigned red, unsigned green, unsigned blue) { outb(regno, VGA_PEL_IW); outb(red >> 10, VGA_PEL_D); outb(green >> 10, VGA_PEL_D); outb(blue >> 10, VGA_PEL_D); } static int vga16fb_setcolreg(unsigned regno, unsigned red, unsigned green, unsigned blue, unsigned transp, struct fb_info *info) { struct vga16fb_par *par = info->par; int gray; /* * Set a single color register. The values supplied are * already rounded down to the hardware's capabilities * (according to the entries in the `var' structure). Return * != 0 for invalid regno. */ if (regno >= 256) return 1; gray = info->var.grayscale; if (gray) { /* gray = 0.30*R + 0.59*G + 0.11*B */ red = green = blue = (red * 77 + green * 151 + blue * 28) >> 8; } if (par->isVGA) vga16_setpalette(regno,red,green,blue); else ega16_setpalette(regno,red,green,blue); return 0; } static int vga16fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { vga16fb_pan_var(info, var); return 0; } /* The following VESA blanking code is taken from vgacon.c. The VGA blanking code was originally by Huang shi chao, and modified by Christoph Rimek (chrimek@toppoint.de) and todd j. derr (tjd@barefoot.org) for Linux. */ static void vga_vesa_blank(struct vga16fb_par *par, int mode) { unsigned char SeqCtrlIndex = vga_io_r(VGA_SEQ_I); unsigned char CrtCtrlIndex = vga_io_r(VGA_CRT_IC); /* save original values of VGA controller registers */ if(!par->vesa_blanked) { par->vga_state.CrtMiscIO = vga_io_r(VGA_MIS_R); //sti(); par->vga_state.HorizontalTotal = vga_io_rcrt(0x00); /* HorizontalTotal */ par->vga_state.HorizDisplayEnd = vga_io_rcrt(0x01); /* HorizDisplayEnd */ par->vga_state.StartHorizRetrace = vga_io_rcrt(0x04); /* StartHorizRetrace */ par->vga_state.EndHorizRetrace = vga_io_rcrt(0x05); /* EndHorizRetrace */ par->vga_state.Overflow = vga_io_rcrt(0x07); /* Overflow */ par->vga_state.StartVertRetrace = vga_io_rcrt(0x10); /* StartVertRetrace */ par->vga_state.EndVertRetrace = vga_io_rcrt(0x11); /* EndVertRetrace */ par->vga_state.ModeControl = vga_io_rcrt(0x17); /* ModeControl */ par->vga_state.ClockingMode = vga_io_rseq(0x01); /* ClockingMode */ } /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ vga_io_wseq(0x01, par->vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... */ if ((par->vga_state.CrtMiscIO & 0x80) == 0x80) vga_io_w(VGA_MIS_W, par->vga_state.CrtMiscIO & 0xef); /* * Set <End of vertical retrace> to minimum (0) and * <Start of vertical Retrace> to maximum (incl. overflow) * Result: turn off vertical sync (VSync) pulse. */ if (mode & FB_BLANK_VSYNC_SUSPEND) { vga_io_wcrt(VGA_CRTC_V_SYNC_START, 0xff); vga_io_wcrt(VGA_CRTC_V_SYNC_END, 0x40); /* bits 9,10 of vert. retrace */ vga_io_wcrt(VGA_CRTC_OVERFLOW, par->vga_state.Overflow | 0x84); } if (mode & FB_BLANK_HSYNC_SUSPEND) { /* * Set <End of horizontal retrace> to minimum (0) and * <Start of horizontal Retrace> to maximum * Result: turn off horizontal sync (HSync) pulse. */ vga_io_wcrt(VGA_CRTC_H_SYNC_START, 0xff); vga_io_wcrt(VGA_CRTC_H_SYNC_END, 0x00); } /* restore both index registers */ outb_p(SeqCtrlIndex, VGA_SEQ_I); outb_p(CrtCtrlIndex, VGA_CRT_IC); } static void vga_vesa_unblank(struct vga16fb_par *par) { unsigned char SeqCtrlIndex = vga_io_r(VGA_SEQ_I); unsigned char CrtCtrlIndex = vga_io_r(VGA_CRT_IC); /* restore original values of VGA controller registers */ vga_io_w(VGA_MIS_W, par->vga_state.CrtMiscIO); /* HorizontalTotal */ vga_io_wcrt(0x00, par->vga_state.HorizontalTotal); /* HorizDisplayEnd */ vga_io_wcrt(0x01, par->vga_state.HorizDisplayEnd); /* StartHorizRetrace */ vga_io_wcrt(0x04, par->vga_state.StartHorizRetrace); /* EndHorizRetrace */ vga_io_wcrt(0x05, par->vga_state.EndHorizRetrace); /* Overflow */ vga_io_wcrt(0x07, par->vga_state.Overflow); /* StartVertRetrace */ vga_io_wcrt(0x10, par->vga_state.StartVertRetrace); /* EndVertRetrace */ vga_io_wcrt(0x11, par->vga_state.EndVertRetrace); /* ModeControl */ vga_io_wcrt(0x17, par->vga_state.ModeControl); /* ClockingMode */ vga_io_wseq(0x01, par->vga_state.ClockingMode); /* restore index/control registers */ vga_io_w(VGA_SEQ_I, SeqCtrlIndex); vga_io_w(VGA_CRT_IC, CrtCtrlIndex); } static void vga_pal_blank(void) { int i; for (i=0; i<16; i++) { outb_p(i, VGA_PEL_IW); outb_p(0, VGA_PEL_D); outb_p(0, VGA_PEL_D); outb_p(0, VGA_PEL_D); } } /* 0 unblank, 1 blank, 2 no vsync, 3 no hsync, 4 off */ static int vga16fb_blank(int blank, struct fb_info *info) { struct vga16fb_par *par = info->par; switch (blank) { case FB_BLANK_UNBLANK: /* Unblank */ if (par->vesa_blanked) { vga_vesa_unblank(par); par->vesa_blanked = 0; } if (par->palette_blanked) { par->palette_blanked = 0; } break; case FB_BLANK_NORMAL: /* blank */ vga_pal_blank(); par->palette_blanked = 1; break; default: /* VESA blanking */ vga_vesa_blank(par, blank); par->vesa_blanked = 1; break; } return 0; } static void vga_8planes_fillrect(struct fb_info *info, const struct fb_fillrect *rect) { u32 dx = rect->dx, width = rect->width; char oldindex = getindex(); char oldmode = setmode(0x40); char oldmask = selectmask(); int line_ofs, height; char oldop, oldsr; char __iomem *where; dx /= 4; where = info->screen_base + dx + rect->dy * info->fix.line_length; if (rect->rop == ROP_COPY) { oldop = setop(0); oldsr = setsr(0); width /= 4; line_ofs = info->fix.line_length - width; setmask(0xff); height = rect->height; while (height--) { int x; /* we can do memset... */ for (x = width; x > 0; --x) { writeb(rect->color, where); where++; } where += line_ofs; } } else { char oldcolor = setcolor(0xf); int y; oldop = setop(0x18); oldsr = setsr(0xf); setmask(0x0F); for (y = 0; y < rect->height; y++) { rmw(where); rmw(where+1); where += info->fix.line_length; } setcolor(oldcolor); } setmask(oldmask); setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga16fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect) { int x, x2, y2, vxres, vyres, width, height, line_ofs; char __iomem *dst; vxres = info->var.xres_virtual; vyres = info->var.yres_virtual; if (!rect->width || !rect->height || rect->dx > vxres || rect->dy > vyres) return; /* We could use hardware clipping but on many cards you get around * hardware clipping by writing to framebuffer directly. */ x2 = rect->dx + rect->width; y2 = rect->dy + rect->height; x2 = x2 < vxres ? x2 : vxres; y2 = y2 < vyres ? y2 : vyres; width = x2 - rect->dx; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { height = y2 - rect->dy; width = rect->width/8; line_ofs = info->fix.line_length - width; dst = info->screen_base + (rect->dx/8) + rect->dy * info->fix.line_length; switch (rect->rop) { case ROP_COPY: setmode(0); setop(0); setsr(0xf); setcolor(rect->color); selectmask(); setmask(0xff); while (height--) { for (x = 0; x < width; x++) { writeb(0, dst); dst++; } dst += line_ofs; } break; case ROP_XOR: setmode(0); setop(0x18); setsr(0xf); setcolor(0xf); selectmask(); setmask(0xff); while (height--) { for (x = 0; x < width; x++) { rmw(dst); dst++; } dst += line_ofs; } break; } } else vga_8planes_fillrect(info, rect); break; case FB_TYPE_PACKED_PIXELS: default: cfb_fillrect(info, rect); break; } } static void vga_8planes_copyarea(struct fb_info *info, const struct fb_copyarea *area) { char oldindex = getindex(); char oldmode = setmode(0x41); char oldop = setop(0); char oldsr = setsr(0xf); int height, line_ofs, x; u32 sx, dx, width; char __iomem *dest; char __iomem *src; height = area->height; sx = area->sx / 4; dx = area->dx / 4; width = area->width / 4; if (area->dy < area->sy || (area->dy == area->sy && dx < sx)) { line_ofs = info->fix.line_length - width; dest = info->screen_base + dx + area->dy * info->fix.line_length; src = info->screen_base + sx + area->sy * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { readb(src); writeb(0, dest); src++; dest++; } src += line_ofs; dest += line_ofs; } } else { line_ofs = info->fix.line_length - width; dest = info->screen_base + dx + width + (area->dy + height - 1) * info->fix.line_length; src = info->screen_base + sx + width + (area->sy + height - 1) * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { --src; --dest; readb(src); writeb(0, dest); } src -= line_ofs; dest -= line_ofs; } } setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga16fb_copyarea(struct fb_info *info, const struct fb_copyarea *area) { u32 dx = area->dx, dy = area->dy, sx = area->sx, sy = area->sy; int x, x2, y2, old_dx, old_dy, vxres, vyres; int height, width, line_ofs; char __iomem *dst = NULL; char __iomem *src = NULL; vxres = info->var.xres_virtual; vyres = info->var.yres_virtual; if (area->dx > vxres || area->sx > vxres || area->dy > vyres || area->sy > vyres) return; /* clip the destination */ old_dx = area->dx; old_dy = area->dy; /* * We could use hardware clipping but on many cards you get around * hardware clipping by writing to framebuffer directly. */ x2 = area->dx + area->width; y2 = area->dy + area->height; dx = area->dx > 0 ? area->dx : 0; dy = area->dy > 0 ? area->dy : 0; x2 = x2 < vxres ? x2 : vxres; y2 = y2 < vyres ? y2 : vyres; width = x2 - dx; height = y2 - dy; if (sx + dx < old_dx || sy + dy < old_dy) return; /* update sx1,sy1 */ sx += (dx - old_dx); sy += (dy - old_dy); /* the source must be completely inside the virtual screen */ if (sx + width > vxres || sy + height > vyres) return; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { width = width/8; line_ofs = info->fix.line_length - width; setmode(1); setop(0); setsr(0xf); if (dy < sy || (dy == sy && dx < sx)) { dst = info->screen_base + (dx/8) + dy * info->fix.line_length; src = info->screen_base + (sx/8) + sy * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { readb(src); writeb(0, dst); dst++; src++; } src += line_ofs; dst += line_ofs; } } else { dst = info->screen_base + (dx/8) + width + (dy + height - 1) * info->fix.line_length; src = info->screen_base + (sx/8) + width + (sy + height - 1) * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { dst--; src--; readb(src); writeb(0, dst); } src -= line_ofs; dst -= line_ofs; } } } else vga_8planes_copyarea(info, area); break; case FB_TYPE_PACKED_PIXELS: default: cfb_copyarea(info, area); break; } } #define TRANS_MASK_LOW {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF} #define TRANS_MASK_HIGH {0x000, 0x800, 0x400, 0xC00, 0x200, 0xA00, 0x600, 0xE00, \ 0x100, 0x900, 0x500, 0xD00, 0x300, 0xB00, 0x700, 0xF00} #if defined(__LITTLE_ENDIAN) static const u16 transl_l[] = TRANS_MASK_LOW; static const u16 transl_h[] = TRANS_MASK_HIGH; #elif defined(__BIG_ENDIAN) static const u16 transl_l[] = TRANS_MASK_HIGH; static const u16 transl_h[] = TRANS_MASK_LOW; #else #error "Only __BIG_ENDIAN and __LITTLE_ENDIAN are supported in vga-planes" #endif static void vga_8planes_imageblit(struct fb_info *info, const struct fb_image *image) { char oldindex = getindex(); char oldmode = setmode(0x40); char oldop = setop(0); char oldsr = setsr(0); char oldmask = selectmask(); const unsigned char *cdat = image->data; u32 dx = image->dx; char __iomem *where; int y; dx /= 4; where = info->screen_base + dx + image->dy * info->fix.line_length; setmask(0xff); writeb(image->bg_color, where); readb(where); selectmask(); setmask(image->fg_color ^ image->bg_color); setmode(0x42); setop(0x18); for (y = 0; y < image->height; y++, where += info->fix.line_length) writew(transl_h[cdat[y]&0xF] | transl_l[cdat[y] >> 4], where); setmask(oldmask); setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga_imageblit_expand(struct fb_info *info, const struct fb_image *image) { char __iomem *where = info->screen_base + (image->dx/8) + image->dy * info->fix.line_length; struct vga16fb_par *par = info->par; char *cdat = (char *) image->data; char __iomem *dst; int x, y; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { if (par->isVGA) { setmode(2); setop(0); setsr(0xf); setcolor(image->fg_color); selectmask(); setmask(0xff); writeb(image->bg_color, where); rmb(); readb(where); /* fill latches */ setmode(3); wmb(); for (y = 0; y < image->height; y++) { dst = where; for (x = image->width/8; x--;) writeb(*cdat++, dst++); where += info->fix.line_length; } wmb(); } else { setmode(0); setop(0); setsr(0xf); setcolor(image->bg_color); selectmask(); setmask(0xff); for (y = 0; y < image->height; y++) { dst = where; for (x=image->width/8; x--;){ rmw(dst); setcolor(image->fg_color); selectmask(); if (*cdat) { setmask(*cdat++); rmw(dst++); } } where += info->fix.line_length; } } } else vga_8planes_imageblit(info, image); break; case FB_TYPE_PACKED_PIXELS: default: cfb_imageblit(info, image); break; } } static void vga_imageblit_color(struct fb_info *info, const struct fb_image *image) { /* * Draw logo */ struct vga16fb_par *par = info->par; char __iomem *where = info->screen_base + image->dy * info->fix.line_length + image->dx/8; const char *cdat = image->data; char __iomem *dst; int x, y; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4 && par->isVGA) { setsr(0xf); setop(0); setmode(0); for (y = 0; y < image->height; y++) { for (x = 0; x < image->width; x++) { dst = where + x/8; setcolor(*cdat); selectmask(); setmask(1 << (7 - (x % 8))); fb_readb(dst); fb_writeb(0, dst); cdat++; } where += info->fix.line_length; } } break; case FB_TYPE_PACKED_PIXELS: cfb_imageblit(info, image); break; default: break; } } static void vga16fb_imageblit(struct fb_info *info, const struct fb_image *image) { if (image->depth == 1) vga_imageblit_expand(info, image); else vga_imageblit_color(info, image); } static void vga16fb_destroy(struct fb_info *info) { iounmap(info->screen_base); fb_dealloc_cmap(&info->cmap); /* XXX unshare VGA regions */ framebuffer_release(info); } static struct fb_ops vga16fb_ops = { .owner = THIS_MODULE, .fb_open = vga16fb_open, .fb_release = vga16fb_release, .fb_destroy = vga16fb_destroy, .fb_check_var = vga16fb_check_var, .fb_set_par = vga16fb_set_par, .fb_setcolreg = vga16fb_setcolreg, .fb_pan_display = vga16fb_pan_display, .fb_blank = vga16fb_blank, .fb_fillrect = vga16fb_fillrect, .fb_copyarea = vga16fb_copyarea, .fb_imageblit = vga16fb_imageblit, }; #ifndef MODULE static int __init vga16fb_setup(char *options) { char *this_opt; if (!options || !*options) return 0; while ((this_opt = strsep(&options, ",")) != NULL) { if (!*this_opt) continue; } return 0; } #endif static int vga16fb_probe(struct platform_device *dev) { struct fb_info *info; struct vga16fb_par *par; int i; int ret = 0; printk(KERN_DEBUG "vga16fb: initializing\n"); info = framebuffer_alloc(sizeof(struct vga16fb_par), &dev->dev); if (!info) { ret = -ENOMEM; goto err_fb_alloc; } info->apertures = alloc_apertures(1); if (!info->apertures) { ret = -ENOMEM; goto err_ioremap; } /* XXX share VGA_FB_PHYS and I/O region with vgacon and others */ info->screen_base = (void __iomem *)VGA_MAP_MEM(VGA_FB_PHYS, 0); if (!info->screen_base) { printk(KERN_ERR "vga16fb: unable to map device\n"); ret = -ENOMEM; goto err_ioremap; } printk(KERN_INFO "vga16fb: mapped to 0x%p\n", info->screen_base); par = info->par; par->isVGA = screen_info.orig_video_isVGA; par->palette_blanked = 0; par->vesa_blanked = 0; i = par->isVGA? 6 : 2; vga16fb_defined.red.length = i; vga16fb_defined.green.length = i; vga16fb_defined.blue.length = i; /* name should not depend on EGA/VGA */ info->fbops = &vga16fb_ops; info->var = vga16fb_defined; info->fix = vga16fb_fix; /* supports rectangles with widths of multiples of 8 */ info->pixmap.blit_x = 1 << 7 | 1 << 15 | 1 << 23 | 1 << 31; info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE | FBINFO_HWACCEL_YPAN; i = (info->var.bits_per_pixel == 8) ? 256 : 16; ret = fb_alloc_cmap(&info->cmap, i, 0); if (ret) { printk(KERN_ERR "vga16fb: unable to allocate colormap\n"); ret = -ENOMEM; goto err_alloc_cmap; } if (vga16fb_check_var(&info->var, info)) { printk(KERN_ERR "vga16fb: unable to validate variable\n"); ret = -EINVAL; goto err_check_var; } vga16fb_update_fix(info); info->apertures->ranges[0].base = VGA_FB_PHYS; info->apertures->ranges[0].size = VGA_FB_PHYS_LEN; if (register_framebuffer(info) < 0) { printk(KERN_ERR "vga16fb: unable to register framebuffer\n"); ret = -EINVAL; goto err_check_var; } fb_info(info, "%s frame buffer device\n", info->fix.id); platform_set_drvdata(dev, info); return 0; err_check_var: fb_dealloc_cmap(&info->cmap); err_alloc_cmap: iounmap(info->screen_base); err_ioremap: framebuffer_release(info); err_fb_alloc: return ret; } static int vga16fb_remove(struct platform_device *dev) { struct fb_info *info = platform_get_drvdata(dev); if (info) unregister_framebuffer(info); return 0; } static struct platform_driver vga16fb_driver = { .probe = vga16fb_probe, .remove = vga16fb_remove, .driver = { .name = "vga16fb", }, }; static struct platform_device *vga16fb_device; static int __init vga16fb_init(void) { int ret; #ifndef MODULE char *option = NULL; if (fb_get_options("vga16fb", &option)) return -ENODEV; vga16fb_setup(option); #endif ret = platform_driver_register(&vga16fb_driver); if (!ret) { vga16fb_device = platform_device_alloc("vga16fb", 0); if (vga16fb_device) ret = platform_device_add(vga16fb_device); else ret = -ENOMEM; if (ret) { platform_device_put(vga16fb_device); platform_driver_unregister(&vga16fb_driver); } } return ret; } static void __exit vga16fb_exit(void) { platform_device_unregister(vga16fb_device); platform_driver_unregister(&vga16fb_driver); } MODULE_DESCRIPTION("Legacy VGA framebuffer device driver"); MODULE_LICENSE("GPL"); module_init(vga16fb_init); module_exit(vga16fb_exit); /* * Overrides for Emacs so that we follow Linus's tabbing style. * --------------------------------------------------------------------------- * Local variables: * c-basic-offset: 8 * End: */ |
45 19 30 19 57 57 57 57 56 57 57 57 37 57 30 30 30 30 30 30 30 37 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 | /* RxRPC packet transmission * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" struct rxrpc_ack_buffer { struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; u8 acks[255]; u8 pad[3]; struct rxrpc_ackinfo ackinfo; }; struct rxrpc_abort_buffer { struct rxrpc_wire_header whdr; __be32 abort_code; }; static const char rxrpc_keepalive_string[] = ""; /* * Increase Tx backoff on transmission failure and clear it on success. */ static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) { if (ret < 0) { u16 tx_backoff = READ_ONCE(call->tx_backoff); if (tx_backoff < HZ) WRITE_ONCE(call->tx_backoff, tx_backoff + 1); } else { WRITE_ONCE(call->tx_backoff, 0); } } /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep * the route through any intervening firewall open. * * Receiving a response to the ping will prevent the ->expect_rx_by timer from * expiring. */ static void rxrpc_set_keepalive(struct rxrpc_call *call) { unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; keepalive_at += now; WRITE_ONCE(call->keepalive_at, keepalive_at); rxrpc_reduce_call_timer(call, keepalive_at, now, rxrpc_timer_set_for_keepalive); } /* * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_call *call, struct rxrpc_ack_buffer *pkt, rxrpc_seq_t *_hard_ack, rxrpc_seq_t *_top, u8 reason) { rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; /* Barrier against rxrpc_input_data(). */ serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); *_hard_ack = hard_ack; *_top = top; pkt->ack.bufferSpace = htons(8); pkt->ack.maxSkew = htons(call->ackr_skew); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(serial); pkt->ack.reason = reason; pkt->ack.nAcks = top - hard_ack; if (reason == RXRPC_ACK_PING) pkt->whdr.flags |= RXRPC_REQUEST_ACK; if (after(top, hard_ack)) { seq = hard_ack + 1; do { ix = seq & RXRPC_RXTX_BUFF_MASK; if (call->rxtx_buffer[ix]) *ackp++ = RXRPC_ACK_TYPE_ACK; else *ackp++ = RXRPC_ACK_TYPE_NACK; seq++; } while (before_eq(seq, top)); } mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; return top - hard_ack + 3; } /* * Send an ACK call packet. */ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_serial_t *_serial) { struct rxrpc_connection *conn; struct rxrpc_ack_buffer *pkt; struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; size_t len, n; int ret; u8 reason; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); if (!pkt) return -ENOMEM; conn = call->conn; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; pkt->whdr.epoch = htonl(conn->proto.epoch); pkt->whdr.cid = htonl(call->cid); pkt->whdr.callNumber = htonl(call->call_id); pkt->whdr.seq = 0; pkt->whdr.type = RXRPC_PACKET_TYPE_ACK; pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag; pkt->whdr.userStatus = 0; pkt->whdr.securityIndex = call->security_ix; pkt->whdr._rsvd = 0; pkt->whdr.serviceId = htons(call->service_id); spin_lock_bh(&call->lock); if (ping) { reason = RXRPC_ACK_PING; } else { reason = call->ackr_reason; if (!call->ackr_reason) { spin_unlock_bh(&call->lock); ret = 0; goto out; } call->ackr_reason = 0; } n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason); spin_unlock_bh(&call->lock); iov[0].iov_base = pkt; iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); len = iov[0].iov_len + iov[1].iov_len; serial = atomic_inc_return(&conn->serial); pkt->whdr.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); if (_serial) *_serial = serial; if (ping) { call->ping_serial = serial; smp_wmb(); /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but * asking UDP to send the packet can take a relatively long * time. */ call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { if (ping) clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), false, true, rxrpc_propose_ack_retry_tx); } else { spin_lock_bh(&call->lock); if (after(hard_ack, call->ackr_consumed)) call->ackr_consumed = hard_ack; if (after(top, call->ackr_seen)) call->ackr_seen = top; spin_unlock_bh(&call->lock); } rxrpc_set_keepalive(call); } out: kfree(pkt); return ret; } /* * Send an ABORT call packet. */ int rxrpc_send_abort_packet(struct rxrpc_call *call) { struct rxrpc_connection *conn; struct rxrpc_abort_buffer pkt; struct msghdr msg; struct kvec iov[1]; rxrpc_serial_t serial; int ret; /* Don't bother sending aborts for a client call once the server has * hard-ACK'd all of its request data. After that point, we're not * going to stop the operation proceeding, and whilst we might limit * the reply, it's not worth it if we can send a new call on the same * channel instead, thereby closing off this call. */ if (rxrpc_is_client_call(call) && test_bit(RXRPC_CALL_TX_LAST, &call->flags)) return 0; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; conn = call->conn; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(call->cid); pkt.whdr.callNumber = htonl(call->call_id); pkt.whdr.seq = 0; pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT; pkt.whdr.flags = conn->out_clientflag; pkt.whdr.userStatus = 0; pkt.whdr.securityIndex = call->security_ix; pkt.whdr._rsvd = 0; pkt.whdr.serviceId = htons(call->service_id); pkt.abort_code = htonl(call->abort_code); iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, sizeof(pkt)); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_abort); else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); rxrpc_tx_backoff(call, ret); return ret; } /* * send a packet through the transport endpoint */ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, bool retrans) { struct rxrpc_connection *conn = call->conn; struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; size_t len; bool lost = false; int ret, opt; _enter(",{%d}", skb->len); /* Each transmission of a Tx packet needs a new serial number */ serial = atomic_inc_return(&conn->serial); whdr.epoch = htonl(conn->proto.epoch); whdr.cid = htonl(call->cid); whdr.callNumber = htonl(call->call_id); whdr.seq = htonl(sp->hdr.seq); whdr.serial = htonl(serial); whdr.type = RXRPC_PACKET_TYPE_DATA; whdr.flags = sp->hdr.flags; whdr.userStatus = 0; whdr.securityIndex = call->security_ix; whdr._rsvd = htons(sp->hdr._rsvd); whdr.serviceId = htons(call->service_id); if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && sp->hdr.seq == 1) whdr.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = skb->head; iov[1].iov_len = skb->len; len = iov[0].iov_len + iov[1].iov_len; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * * However, we mustn't request an ACK on the last reply packet of a * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || rxrpc_to_server(sp) ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))) whdr.flags |= RXRPC_REQUEST_ACK; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { ret = 0; lost = true; goto done; } } _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); /* send the packet with the don't fragment bit set if we currently * think it's small enough */ if (iov[1].iov_len >= call->peer->maxdata) goto send_fragmentable; down_read(&conn->params.local->defrag_sem); sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface * - in which case, we'll have processed the ICMP error * message and update the peer record */ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); up_read(&conn->params.local->defrag_sem); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_nofrag); else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; done: trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, lost); if (ret >= 0) { if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_usage > 1) { unsigned long nowj = jiffies, ack_lost_at; ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt); if (ack_lost_at < 1) ack_lost_at = 1; ack_lost_at += nowj; WRITE_ONCE(call->ack_lost_at, ack_lost_at); rxrpc_reduce_call_timer(call, ack_lost_at, nowj, rxrpc_timer_set_for_lost_ack); } } if (sp->hdr.seq == 1 && !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { unsigned long nowj = jiffies, expect_rx_by; expect_rx_by = nowj + call->next_rx_timo; WRITE_ONCE(call->expect_rx_by, expect_rx_by); rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } rxrpc_set_keepalive(call); } else { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that * aren't going away anytime soon. The layer above can arrange * the retransmission. */ if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); down_write(&conn->params.local->defrag_sem); sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: opt = IP_PMTUDISC_DONT; kernel_setsockopt(conn->params.local->socket, SOL_IP, IP_MTU_DISCOVER, (char *)&opt, sizeof(opt)); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, IP_MTU_DISCOVER, (char *)&opt, sizeof(opt)); break; default: BUG(); } if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_frag); else trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; } /* * reject packets through the local endpoint */ void rxrpc_reject_packets(struct rxrpc_local *local) { struct sockaddr_rxrpc srx; struct rxrpc_skb_priv *sp; struct rxrpc_wire_header whdr; struct sk_buff *skb; struct msghdr msg; struct kvec iov[2]; size_t size; __be32 code; int ret, ioc; _enter("%d", local->debug_id); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_REJECT_BUSY: whdr.type = RXRPC_PACKET_TYPE_BUSY; size = sizeof(whdr); ioc = 1; break; case RXRPC_SKB_MARK_REJECT_ABORT: whdr.type = RXRPC_PACKET_TYPE_ABORT; code = htonl(skb->priority); size = sizeof(whdr) + sizeof(code); ioc = 2; break; default: rxrpc_free_skb(skb, rxrpc_skb_rx_freed); continue; } if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { msg.msg_namelen = srx.transport_len; whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); whdr.callNumber = htonl(sp->hdr.callNumber); whdr.serviceId = htons(sp->hdr.serviceId); whdr.flags = sp->hdr.flags; whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; ret = kernel_sendmsg(local->socket, &msg, iov, ioc, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); else trace_rxrpc_tx_packet(local->debug_id, &whdr, rxrpc_tx_point_reject); } rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); } /* * Send a VERSION reply to a peer as a keepalive. */ void rxrpc_send_keepalive(struct rxrpc_peer *peer) { struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; size_t len; int ret; _enter(""); msg.msg_name = &peer->srx.transport; msg.msg_namelen = peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; whdr.epoch = htonl(peer->local->rxnet->epoch); whdr.cid = 0; whdr.callNumber = 0; whdr.seq = 0; whdr.serial = 0; whdr.type = RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */ whdr.flags = RXRPC_LAST_PACKET; whdr.userStatus = 0; whdr.securityIndex = 0; whdr._rsvd = 0; whdr.serviceId = 0; iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = (char *)rxrpc_keepalive_string; iov[1].iov_len = sizeof(rxrpc_keepalive_string); len = iov[0].iov_len + iov[1].iov_len; _proto("Tx VERSION (keepalive)"); ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); if (ret < 0) trace_rxrpc_tx_fail(peer->debug_id, 0, ret, rxrpc_tx_point_version_keepalive); else trace_rxrpc_tx_packet(peer->debug_id, &whdr, rxrpc_tx_point_version_keepalive); peer->last_tx_at = ktime_get_seconds(); _leave(""); } |
27 27 27 27 23 23 23 23 23 23 23 23 23 1 2 2 2 2 2 2 1 1 1 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vbi-out.c - vbi output support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/videodev2.h> #include <media/v4l2-common.h> #include "vivid-core.h" #include "vivid-kthread-out.h" #include "vivid-vbi-out.h" #include "vivid-vbi-cap.h" static int vbi_out_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); bool is_60hz = dev->std_out & V4L2_STD_525_60; unsigned size = vq->type == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); if (!vivid_is_svid_out(dev)) return -EINVAL; sizes[0] = size; if (vq->num_buffers + *nbuffers < 2) *nbuffers = 2 - vq->num_buffers; *nplanes = 1; return 0; } static int vbi_out_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); bool is_60hz = dev->std_out & V4L2_STD_525_60; unsigned size = vb->vb2_queue->type == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); dprintk(dev, 1, "%s\n", __func__); if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } if (vb2_plane_size(vb, 0) < size) { dprintk(dev, 1, "%s data will not fit into plane (%lu < %u)\n", __func__, vb2_plane_size(vb, 0), size); return -EINVAL; } vb2_set_plane_payload(vb, 0, size); return 0; } static void vbi_out_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vbi_out_active); spin_unlock(&dev->slock); } static int vbi_out_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dprintk(dev, 1, "%s\n", __func__); dev->vbi_out_seq_count = 0; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_out(dev, &dev->vbi_out_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vbi_out_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vbi_out_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_out(dev, &dev->vbi_out_streaming); dev->vbi_out_have_wss = false; dev->vbi_out_have_cc[0] = false; dev->vbi_out_have_cc[1] = false; } const struct vb2_ops vivid_vbi_out_qops = { .queue_setup = vbi_out_queue_setup, .buf_prepare = vbi_out_buf_prepare, .buf_queue = vbi_out_buf_queue, .start_streaming = vbi_out_start_streaming, .stop_streaming = vbi_out_stop_streaming, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, }; int vidioc_g_fmt_vbi_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_vbi_format *vbi = &f->fmt.vbi; bool is_60hz = dev->std_out & V4L2_STD_525_60; if (!vivid_is_svid_out(dev) || !dev->has_raw_vbi_out) return -EINVAL; vbi->sampling_rate = 25000000; vbi->offset = 24; vbi->samples_per_line = 1440; vbi->sample_format = V4L2_PIX_FMT_GREY; vbi->start[0] = is_60hz ? V4L2_VBI_ITU_525_F1_START + 9 : V4L2_VBI_ITU_625_F1_START + 5; vbi->start[1] = is_60hz ? V4L2_VBI_ITU_525_F2_START + 9 : V4L2_VBI_ITU_625_F2_START + 5; vbi->count[0] = vbi->count[1] = is_60hz ? 12 : 18; vbi->flags = dev->vbi_cap_interlaced ? V4L2_VBI_INTERLACED : 0; vbi->reserved[0] = 0; vbi->reserved[1] = 0; return 0; } int vidioc_s_fmt_vbi_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); int ret = vidioc_g_fmt_vbi_out(file, priv, f); if (ret) return ret; if (vb2_is_busy(&dev->vb_vbi_out_q)) return -EBUSY; dev->stream_sliced_vbi_out = false; dev->vbi_out_dev.queue->type = V4L2_BUF_TYPE_VBI_OUTPUT; return 0; } int vidioc_g_fmt_sliced_vbi_out(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; if (!vivid_is_svid_out(dev) || !dev->has_sliced_vbi_out) return -EINVAL; vivid_fill_service_lines(vbi, dev->service_set_out); return 0; } int vidioc_try_fmt_sliced_vbi_out(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; bool is_60hz = dev->std_out & V4L2_STD_525_60; u32 service_set = vbi->service_set; if (!vivid_is_svid_out(dev) || !dev->has_sliced_vbi_out) return -EINVAL; service_set &= is_60hz ? V4L2_SLICED_CAPTION_525 : V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; vivid_fill_service_lines(vbi, service_set); return 0; } int vidioc_s_fmt_sliced_vbi_out(struct file *file, void *fh, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; int ret = vidioc_try_fmt_sliced_vbi_out(file, fh, fmt); if (ret) return ret; if (vb2_is_busy(&dev->vb_vbi_out_q)) return -EBUSY; dev->service_set_out = vbi->service_set; dev->stream_sliced_vbi_out = true; dev->vbi_out_dev.queue->type = V4L2_BUF_TYPE_SLICED_VBI_OUTPUT; return 0; } void vivid_sliced_vbi_out_process(struct vivid_dev *dev, struct vivid_buffer *buf) { struct v4l2_sliced_vbi_data *vbi = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); unsigned elems = vb2_get_plane_payload(&buf->vb.vb2_buf, 0) / sizeof(*vbi); dev->vbi_out_have_cc[0] = false; dev->vbi_out_have_cc[1] = false; dev->vbi_out_have_wss = false; while (elems--) { switch (vbi->id) { case V4L2_SLICED_CAPTION_525: if ((dev->std_out & V4L2_STD_525_60) && vbi->line == 21) { dev->vbi_out_have_cc[!!vbi->field] = true; dev->vbi_out_cc[!!vbi->field][0] = vbi->data[0]; dev->vbi_out_cc[!!vbi->field][1] = vbi->data[1]; } break; case V4L2_SLICED_WSS_625: if ((dev->std_out & V4L2_STD_625_50) && vbi->field == 0 && vbi->line == 23) { dev->vbi_out_have_wss = true; dev->vbi_out_wss[0] = vbi->data[0]; dev->vbi_out_wss[1] = vbi->data[1]; } break; } vbi++; } } |
4 4 4 3 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { const struct iphdr *ip = ip_hdr(skb); if (ttl_check != -1) { if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; else { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); int ret = 0; for_ifa(in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } endfor_ifa(in_dev); return ret; } } return ip->ttl == f_ttl; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); const char *nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; const char *genre = NULL; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return NULL; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, -1, &ctx)) continue; genre = f->genre; break; } return genre; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[], struct netlink_ext_ack *extack) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[], struct netlink_ext_ack *extack) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); |
110 82 110 110 110 82 82 82 82 471 10147 109 10135 10132 10119 10114 10114 10149 10118 10118 10147 6952 6958 6955 6908 6953 6911 6908 6956 10086 6143 989 3 988 9263 5845 430 12 430 1863 4099 2 2 2 3 3 3 3 3 3 8 8 2 2 3 3 3 3 7 1 1 2 2 2 2 3 3 6 5 3 2 1 1 1 6 25 25 25 242 303 61 21 21 21 1 5 1 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 | /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. */ #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /** * idmap_key struct holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /** * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /** * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /** * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /** * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /** * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /** * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup(map->forward, map->nr_extents * sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also had the approprpiate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = prepare_creds(); if (!cred) return -ENOMEM; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); return commit_creds(cred); } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } subsys_initcall(user_namespaces_init); |
1223 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RT_H #define _LINUX_SCHED_RT_H #include <linux/sched.h> struct task_struct; static inline int rt_prio(int prio) { if (unlikely(prio < MAX_RT_PRIO)) return 1; return 0; } static inline int rt_task(struct task_struct *p) { return rt_prio(p->prio); } static inline bool task_is_realtime(struct task_struct *tsk) { int policy = tsk->policy; if (policy == SCHED_FIFO || policy == SCHED_RR) return true; if (policy == SCHED_DEADLINE) return true; return false; } #ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. */ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { return p->pi_top_task; } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return false; } #endif extern void normalize_rt_tasks(void); /* * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ #define RR_TIMESLICE (100 * HZ / 1000) #endif /* _LINUX_SCHED_RT_H */ |
7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | /* Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ #include <linux/list.h> #include <linux/spinlock_types.h> #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) #define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { BPF_LRU_LIST_T_ACTIVE, BPF_LRU_LIST_T_INACTIVE, BPF_LRU_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_PENDING, }; struct bpf_lru_node { struct list_head list; u16 cpu; u8 type; u8 ref; }; struct bpf_lru_list { struct list_head lists[NR_BPF_LRU_LIST_T]; unsigned int counts[NR_BPF_LRU_LIST_COUNT]; /* The next inacitve list rotation starts from here */ struct list_head *next_inactive_rotation; raw_spinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; u16 next_steal; raw_spinlock_t lock; }; struct bpf_common_lru { struct bpf_lru_list lru_list; struct bpf_lru_locallist __percpu *local_list; }; typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); struct bpf_lru { union { struct bpf_common_lru common_lru; struct bpf_lru_list __percpu *percpu_lru; }; del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; unsigned int nr_scans; bool percpu; }; static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) { /* ref is an approximation on access frequency. It does not * have to be very accurate. Hence, no protection is used. */ if (!node->ref) node->ref = 1; } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); #endif |
9 14 14 14 5 3 3 1 1 5 11 9 9 6 11 14 14 1 12 14 12 11 11 11 2 9 9 5 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | /* * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension * * JHS: We should remove the CONFIG_NET_CLS_IND from here * eventually when the meta match extension is made available * */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #define HTSIZE 256 struct fw_head { u32 mask; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; #ifdef CONFIG_NET_CLS_IND int ifindex; #endif /* CONFIG_NET_CLS_IND */ struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; }; static u32 fw_hash(u32 handle) { handle ^= (handle >> 16); handle ^= (handle >> 8); return handle % HTSIZE; } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; int r; u32 id = skb->mark; if (head != NULL) { id &= head->mask; for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f; f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND if (!tcf_match_indev(skb, f->ifindex)) continue; #endif /* CONFIG_NET_CLS_IND */ r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } } } else { struct Qdisc *q = tcf_block_q(tp->chain->block); /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ q->handle)))) { res->classid = id; res->class = 0; return 0; } } return -1; } static void *fw_get(struct tcf_proto *tp, u32 handle) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; if (head == NULL) return NULL; f = rtnl_dereference(head->ht[fw_hash(handle)]); for (; f; f = rtnl_dereference(f->next)) { if (f->id == handle) return f; } return NULL; } static int fw_init(struct tcf_proto *tp) { /* We don't allocate fw_head here, because in the old method * we don't need it at all. */ return 0; } static void __fw_delete_filter(struct fw_filter *f) { tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); } static void fw_delete_filter_work(struct work_struct *work) { struct fw_filter *f = container_of(to_rcu_work(work), struct fw_filter, rwork); rtnl_lock(); __fw_delete_filter(f); rtnl_unlock(); } static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; if (head == NULL) return; for (h = 0; h < HTSIZE; h++) { while ((f = rtnl_dereference(head->ht[h])) != NULL) { RCU_INIT_POINTER(head->ht[h], rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, fw_delete_filter_work); else __fw_delete_filter(f); } } kfree_rcu(head, rcu); } static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = arg; struct fw_filter __rcu **fp; struct fw_filter *pfp; int ret = -EINVAL; int h; if (head == NULL || f == NULL) goto out; fp = &head->ht[fw_hash(f->id)]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { if (pfp == f) { RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, fw_delete_filter_work); ret = 0; break; } } *last = true; for (h = 0; h < HTSIZE; h++) { if (rcu_access_pointer(head->ht[h])) { *last = false; break; } } out: return ret; } static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { [TCA_FW_CLASSID] = { .type = NLA_U32 }, [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_FW_MASK] = { .type = NLA_U32 }, }; static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, extack); if (err < 0) return err; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); } #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); if (ret < 0) return ret; f->ifindex = ret; } #endif /* CONFIG_NET_CLS_IND */ err = -EINVAL; if (tb[TCA_FW_MASK]) { mask = nla_get_u32(tb[TCA_FW_MASK]); if (mask != head->mask) return err; } else if (head->mask != 0xFFFFFFFF) return err; return 0; } static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; int err; if (!opt) return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL); if (err < 0) return err; if (f) { struct fw_filter *pfp, *fnew; struct fw_filter __rcu **fp; if (f->id != handle && handle) return -EINVAL; fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (!fnew) return -ENOBUFS; fnew->id = f->id; fnew->res = f->res; #ifdef CONFIG_NET_CLS_IND fnew->ifindex = f->ifindex; #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) { kfree(fnew); return err; } err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } fp = &head->ht[fw_hash(fnew->id)]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) if (pfp == f) break; RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); rcu_assign_pointer(*fp, fnew); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, fw_delete_filter_work); *arg = fnew; return err; } if (!handle) return -EINVAL; if (!head) { u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) mask = nla_get_u32(tb[TCA_FW_MASK]); head = kzalloc(sizeof(*head), GFP_KERNEL); if (!head) return -ENOBUFS; head->mask = mask; rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (f == NULL) return -ENOBUFS; err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) goto errout; f->id = handle; f->tp = tp; err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); if (err < 0) goto errout; RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]); rcu_assign_pointer(head->ht[fw_hash(handle)], f); *arg = f; return 0; errout: tcf_exts_destroy(&f->exts); kfree(f); return err; } static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct fw_head *head = rtnl_dereference(tp->root); int h; if (head == NULL) arg->stop = 1; if (arg->stop) return; for (h = 0; h < HTSIZE; h++) { struct fw_filter *f; for (f = rtnl_dereference(head->ht[h]); f; f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; } if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; return; } arg->count++; } } } static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = fh; struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->id; if (!f->res.classid && !tcf_exts_has_actions(&f->exts)) return skb->len; nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; } #endif /* CONFIG_NET_CLS_IND */ if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct fw_filter *f = fh; if (f && f->res.classid == classid) { if (cl) __tcf_bind_filter(q, &f->res, base); else __tcf_unbind_filter(q, &f->res); } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, .init = fw_init, .destroy = fw_destroy, .get = fw_get, .change = fw_change, .delete = fw_delete, .walk = fw_walk, .dump = fw_dump, .bind_class = fw_bind_class, .owner = THIS_MODULE, }; static int __init init_fw(void) { return register_tcf_proto_ops(&cls_fw_ops); } static void __exit exit_fw(void) { unregister_tcf_proto_ops(&cls_fw_ops); } module_init(init_fw) module_exit(exit_fw) MODULE_LICENSE("GPL"); |
1 6 29 29 20 28 5 5 6 1 1 1 6 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include "minix.h" static int add_nondir(struct dentry *dentry, struct inode *inode) { int err = minix_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); return 0; } inode_dec_link_count(inode); iput(inode); return err; } static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) return ERR_PTR(-ENAMETOOLONG); ino = minix_inode_by_name(dentry); if (ino) inode = minix_iget(dir->i_sb, ino); return d_splice_alias(inode, dentry); } static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) { int error; struct inode *inode; if (!old_valid_dev(rdev)) return -EINVAL; inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, rdev); mark_inode_dirty(inode); error = add_nondir(dentry, inode); } return error; } static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); if (inode) { minix_set_inode(inode, 0); mark_inode_dirty(inode); d_tmpfile(dentry, inode); } return error; } static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return minix_mknod(dir, dentry, mode, 0); } static int minix_symlink(struct inode * dir, struct dentry *dentry, const char * symname) { int err = -ENAMETOOLONG; int i = strlen(symname)+1; struct inode * inode; if (i > dir->i_sb->s_blocksize) goto out; inode = minix_new_inode(dir, S_IFLNK | 0777, &err); if (!inode) goto out; minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); if (err) goto out_fail; err = add_nondir(dentry, inode); out: return err; out_fail: inode_dec_link_count(inode); iput(inode); goto out; } static int minix_link(struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); } static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode_inc_link_count(dir); inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; minix_set_inode(inode, 0); inode_inc_link_count(inode); err = minix_make_empty(inode, dir); if (err) goto out_fail; err = minix_add_link(dentry, inode); if (err) goto out_fail; d_instantiate(dentry, inode); out: return err; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); iput(inode); out_dir: inode_dec_link_count(dir); goto out; } static int minix_unlink(struct inode * dir, struct dentry *dentry) { int err = -ENOENT; struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; de = minix_find_entry(dentry, &page); if (!de) goto end_unlink; err = minix_delete_entry(de, page); if (err) goto end_unlink; inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); end_unlink: return err; } static int minix_rmdir(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { inode_dec_link_count(dir); inode_dec_link_count(inode); } } return err; } static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; struct page * old_page; struct minix_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; dir_de = minix_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { struct page * new_page; struct minix_dir_entry * new_de; err = -ENOTEMPTY; if (dir_de && !minix_empty_dir(new_inode)) goto out_dir; err = -ENOENT; new_de = minix_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; minix_set_link(new_de, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); } else { err = minix_add_link(new_dentry, old_inode); if (err) goto out_dir; if (dir_de) inode_inc_link_count(new_dir); } minix_delete_entry(old_de, old_page); mark_inode_dirty(old_inode); if (dir_de) { minix_set_link(dir_de, dir_page, new_dir); inode_dec_link_count(old_dir); } return 0; out_dir: if (dir_de) { kunmap(dir_page); put_page(dir_page); } out_old: kunmap(old_page); put_page(old_page); out: return err; } /* * directories can handle most operations... */ const struct inode_operations minix_dir_inode_operations = { .create = minix_create, .lookup = minix_lookup, .link = minix_link, .unlink = minix_unlink, .symlink = minix_symlink, .mkdir = minix_mkdir, .rmdir = minix_rmdir, .mknod = minix_mknod, .rename = minix_rename, .getattr = minix_getattr, .tmpfile = minix_tmpfile, }; |
58 58 1426 10 10 10 10 10 10 10 10 10 10 3 10 201 259 9 258 763 931 930 931 931 931 929 48 48 296 765 763 764 456 5 456 454 11 11 11 11 11 11 11 11 11 11 11 11 11 83 58 57 456 456 456 454 1 1 456 456 11 456 456 158 159 4 4 454 454 456 4 18 7 6 6 18 18 16 16 16 16 1 1 15 18 8 4 4 4 4 4 8 764 763 80 80 3 78 763 3 2 3 3 61 61 61 61 61 9 7 4 4 193 192 193 193 193 193 414 414 414 193 193 377 5 377 409 408 192 376 414 4 73 52 72 72 12 12 5 4 64 2 61 33 32 29 4 37 11 12 39 1 36 15 7 1 7 3 2 7 4 4 3 39 1 69 73 22 22 22 356 356 356 371 371 371 371 661 31 31 31 19 12 31 408 1425 1426 210 1426 1426 513 512 512 28 3 14 2 10 8 5 7 6 4 3 2 2 20 19 18 14 13 16 15 14 13 5 4 8 7 7 3 3 6 5 6 5 16 16 16 16 16 16 16 16 16 16 16 16 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 3 16 16 16 15 5 3 2 2 2 2 5 17 8 17 17 16 16 16 16 16 16 17 77 77 77 69 69 2 77 77 77 15 16 2 1 16 18 13 10 3 16 16 16 16 16 16 14 16 16 23 5 5 5 5 5 23 18 23 22 23 23 23 17 21 23 7 7 7 7 7 7 7 6 7 2 1 2 2 2 1 2 1 2 2 3 3 1 3 1 1 1 1 7 7 7 7 7 2 2 2 2 7 7 63 63 63 1419 1419 1418 1419 1304 1396 1419 1412 1412 515 514 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 | /* * Generic address resolution entity * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Fixes: * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. * Harald Welte Add neighbour cache statistics like rtstat */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/kmemleak.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/times.h> #include <net/net_namespace.h> #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/string.h> #include <linux/log2.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #define DEBUG #define NEIGH_DEBUG 1 #define neigh_dbg(level, fmt, ...) \ do { \ if (level <= NEIGH_DEBUG) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; #endif /* Neighbour hash table buckets are protected with rwlock tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks to protocol backends, no attempts to send something to network. It will result in deadlocks, if backend/driver wants to use neighbour cache. - If the entry requires some non-trivial actions, increase its reference count and release table lock. Neighbour entries are protected: - with reference count. - with rwlock neigh->lock Reference count prevents destruction. neigh->lock mainly serializes ll address data and its validity state. However, the same lock is used to protect another entry fields: - timer - resolution queue Again, nothing clever shall be made under neigh->lock, the most complicated procedure, which we allow is dev->hard_header. It is supposed, that dev->hard_header is simplistic and does not make callbacks to neighbour tables. */ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) { kfree_skb(skb); return -ENETDOWN; } static void neigh_cleanup_and_release(struct neighbour *neigh) { if (neigh->parms->neigh_cleanup) neigh->parms->neigh_cleanup(neigh); __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } /* * It is random distribution in the interval (1/2)*base...(3/2)*base. * It corresponds to default IPv6 settings and is not overridable, * because it is really reasonable choice. */ unsigned long neigh_rand_reach_time(unsigned long base) { return base ? (prandom_u32() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && !(n->flags & flags)) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); n->dead = 1; retval = true; } write_unlock(&n->lock); if (retval) neigh_cleanup_and_release(n); return retval; } bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) { struct neigh_hash_table *nht; void *pkey = ndel->primary_key; u32 hash_val; struct neighbour *n; struct neighbour __rcu **np; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); hash_val = hash_val >> (32 - nht->hash_shift); np = &nht->hash_buckets[hash_val]; while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) return neigh_del(n, 0, 0, np, tbl); np = &n->next; } return false; } static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct neighbour *n; struct neighbour __rcu **np; np = &nht->hash_buckets[i]; while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. * - it is not permanent */ if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, tbl)) { shrunk = 1; continue; } np = &n->next; } } tbl->last_flush = jiffies; write_unlock_bh(&tbl->lock); return shrunk; } static void neigh_add_timer(struct neighbour *n, unsigned long when) { neigh_hold(n); if (unlikely(mod_timer(&n->timer, when))) { printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state); dump_stack(); } } static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && del_timer(&n->timer)) { neigh_release(n); return 1; } return 0; } static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) { struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); if (net == NULL || net_eq(dev_net(skb->dev), net)) { __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; struct neigh_hash_table *nht; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { struct neighbour *n; struct neighbour __rcu **np = &nht->hash_buckets[i]; while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, but someone still uses it. The destroy will be delayed until the last user releases us, but we must kill timers etc. and move it to safe state. */ __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; else n->nud_state = NUD_NONE; neigh_dbg(2, "neigh %p is stray\n", n); } write_unlock(&n->lock); neigh_cleanup_and_release(n); } } } void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev); pneigh_ifdown_and_unlock(tbl, dev); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; } EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; int entries; entries = atomic_inc_return(&tbl->entries) - 1; if (entries >= tbl->gc_thresh3 || (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && entries >= tbl->gc_thresh3) { net_info_ratelimited("%s: neighbor table overflow!\n", tbl->id); NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; } } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); if (!n) goto out_entries; __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; refcount_set(&n->refcnt, 1); n->dead = 1; out: return n; out_entries: atomic_dec(&tbl->entries); goto out; } static void neigh_get_hash_rnd(u32 *x) { *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct neighbour *); struct neigh_hash_table *ret; struct neighbour __rcu **buckets; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; if (size <= PAGE_SIZE) { buckets = kzalloc(size, GFP_ATOMIC); } else { buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); } if (!buckets) { kfree(ret); return NULL; } ret->hash_buckets = buckets; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); struct neighbour __rcu **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) { kfree(buckets); } else { kmemleak_free(buckets); free_pages((unsigned long)buckets, get_order(size)); } kfree(nht); } static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, unsigned long new_shift) { unsigned int i, hash; struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); old_nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); new_nht = neigh_hash_alloc(new_shift); if (!new_nht) return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { struct neighbour *n, *next; for (n = rcu_dereference_protected(old_nht->hash_buckets[i], lockdep_is_held(&tbl->lock)); n != NULL; n = next) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); next = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(n->next, rcu_dereference_protected( new_nht->hash_buckets[hash], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } rcu_assign_pointer(tbl->nht, new_nht); call_rcu(&old_nht->rcu, neigh_hash_free_rcu); return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock_bh(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup); struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey) { struct neighbour *n; unsigned int key_len = tbl->key_len; u32 hash_val; struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup_nodev); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { u32 hash_val; unsigned int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; } memcpy(n->primary_key, pkey, key_len); n->dev = dev; dev_hold(dev); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } if (dev->netdev_ops->ndo_neigh_construct) { error = dev->netdev_ops->ndo_neigh_construct(dev, n); if (error < 0) { rc = ERR_PTR(error); goto out_neigh_release; } } /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { rc = ERR_PTR(error); goto out_neigh_release; } n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock)); n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: write_unlock_bh(&tbl->lock); out_neigh_release: neigh_release(n); goto out; } EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, unsigned int key_len) { u32 hash_val = *(u32 *)(pkey + key_len - 4); hash_val ^= (hash_val >> 16); hash_val ^= hash_val >> 8; hash_val ^= hash_val >> 4; hash_val &= PNEIGH_HASHMASK; return hash_val; } static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, struct net *net, const void *pkey, unsigned int key_len, struct net_device *dev) { while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; n = n->next; } return NULL; } struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); return __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev, int creat) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); if (n || !creat) goto out; ASSERT_RTNL(); n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; if (dev) dev_hold(dev); if (tbl->pconstructor && tbl->pconstructor(n)) { if (dev) dev_put(dev); kfree(n); n = NULL; goto out; } write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; tbl->phash_buckets[hash_val] = n; write_unlock_bh(&tbl->lock); out: return n; } EXPORT_SYMBOL(pneigh_lookup); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { *np = n->next; write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); kfree(n); return 0; } } write_unlock_bh(&tbl->lock); return -ENOENT; } static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n, **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = *np) != NULL) { if (!dev || n->dev == dev) { *np = n->next; n->next = freelist; freelist = n; continue; } np = &n->next; } } write_unlock_bh(&tbl->lock); while ((n = freelist)) { freelist = n->next; n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); if (n->dev) dev_put(n->dev); kfree(n); } return -ENOENT; } static void neigh_parms_destroy(struct neigh_parms *parms); static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) neigh_parms_destroy(parms); } /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { struct net_device *dev = neigh->dev; NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { pr_warn("Destroying alive neighbour %p\n", neigh); dump_stack(); return; } if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); write_lock_bh(&neigh->lock); __skb_queue_purge(&neigh->arp_queue); write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); dev_put(dev); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); atomic_dec(&neigh->tbl->entries); kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); /* Neighbour state is suspicious; disable fast path. Called with write_locked neigh. */ static void neigh_suspect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->output = neigh->ops->output; } /* Neighbour state is OK; enable fast path. Called with write_locked neigh. */ static void neigh_connect(struct neighbour *neigh) { neigh_dbg(2, "neigh %p is connected\n", neigh); neigh->output = neigh->ops->connected_output; } static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neighbour *n; struct neighbour __rcu **np; unsigned int i; struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function */ if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { struct neigh_parms *p; tbl->last_rand = jiffies; list_for_each_entry(p, &tbl->parms_list, list) p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } if (atomic_read(&tbl->entries) < tbl->gc_thresh1) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { np = &nht->hash_buckets[i]; while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock))) != NULL) { unsigned int state; write_lock(&n->lock); state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); goto next_elt; } if (time_before(n->used, n->confirmed)) n->used = n->confirmed; if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; n->dead = 1; write_unlock(&n->lock); neigh_cleanup_and_release(n); continue; } write_unlock(&n->lock); next_elt: np = &n->next; } /* * It's fine to release lock here, even if hash table * grows while we are preempted. */ write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } out: /* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks. * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2 * BASE_REACHABLE_TIME. */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); write_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; return NEIGH_VAR(p, UCAST_PROBES) + NEIGH_VAR(p, APP_PROBES) + (n->nud_state & NUD_PROBE ? NEIGH_VAR(p, MCAST_REPROBES) : NEIGH_VAR(p, MCAST_PROBES)); } static void neigh_invalidate(struct neighbour *neigh) __releases(neigh->lock) __acquires(neigh->lock) { struct sk_buff *skb; NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); neigh_dbg(2, "neigh %p is failed\n", neigh); neigh->updated = jiffies; /* It is very thin place. report_unreachable is very complicated routine. Particularly, it can hit the same neighbour entry! So that, we try to be accurate and avoid dead loop. --ANK */ while (neigh->nud_state == NUD_FAILED && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { write_unlock(&neigh->lock); neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); if (neigh->ops->solicit) neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); kfree_skb(skb); } /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = from_timer(neigh, t, timer); unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) { if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) { neigh_dbg(2, "neigh %p is still alive\n", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_suspect(neigh); next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME); } else { neigh_dbg(2, "neigh %p is suspected\n", neigh); neigh->nud_state = NUD_STALE; neigh->updated = jiffies; neigh_suspect(neigh); notify = 1; } } else if (state & NUD_DELAY) { if (time_before_eq(now, neigh->confirmed + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) { neigh_dbg(2, "neigh %p is now reachable\n", neigh); neigh->nud_state = NUD_REACHABLE; neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { neigh_dbg(2, "neigh %p is probed\n", neigh); neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { neigh->nud_state = NUD_FAILED; notify = 1; neigh_invalidate(neigh); goto out; } if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/2)) next = jiffies + HZ/2; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } if (notify) neigh_update_notify(neigh, 0); neigh_release(neigh); } int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { int rc; bool immediate_probe = false; write_lock_bh(&neigh->lock); rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; if (neigh->dead) goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + NEIGH_VAR(neigh->parms, APP_PROBES)) { unsigned long next, now = jiffies; atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/2); neigh_add_timer(neigh, next); immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; write_unlock_bh(&neigh->lock); kfree_skb(skb); return 1; } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); neigh_del_timer(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME)); } if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { while (neigh->arp_queue_len_bytes + skb->truesize > NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { struct sk_buff *buff; buff = __skb_dequeue(&neigh->arp_queue); if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: if (immediate_probe) neigh_probe(neigh); else write_unlock(&neigh->lock); local_bh_enable(); return rc; out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); kfree_skb(skb); return 1; } EXPORT_SYMBOL(__neigh_event_send); static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) = NULL; if (neigh->dev->header_ops) update = neigh->dev->header_ops->cache_update; if (update) { hh = &neigh->hh; if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); } } } /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- flags NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, if it is different. NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. Caller MUST hold reference count on the entry. */ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid) { u8 old; int err; int notify = 0; struct net_device *dev; int update_isrouter = 0; write_lock_bh(&neigh->lock); dev = neigh->dev; old = neigh->nud_state; err = -EPERM; if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; if (neigh->dead) goto out; neigh_update_ext_learned(neigh, flags, ¬ify); if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) neigh_suspect(neigh); neigh->nud_state = new; err = 0; notify = old & NUD_VALID; if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && (new & NUD_FAILED)) { neigh_invalidate(neigh); notify = 1; } goto out; } /* Compare new lladdr with cached one */ if (!dev->addr_len) { /* First case: device needs no address. */ lladdr = neigh->ha; } else if (lladdr) { /* The second case: if something is already cached and a new address is proposed: - compare new & old - if they are different, check override flag */ if ((old & NUD_VALID) && !memcmp(lladdr, neigh->ha, dev->addr_len)) lladdr = neigh->ha; } else { /* No address is supplied; if we know something, use it, otherwise discard the request. */ err = -EINVAL; if (!(old & NUD_VALID)) goto out; lladdr = neigh->ha; } /* Update confirmed timestamp for neighbour entry after we * received ARP packet even if it doesn't change IP to MAC binding. */ if (new & NUD_CONNECTED) neigh->confirmed = jiffies; /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ err = 0; update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; if (old & NUD_VALID) { if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { update_isrouter = 0; if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && (old & NUD_CONNECTED)) { lladdr = neigh->ha; new = NUD_STALE; } else goto out; } else { if (lladdr == neigh->ha && new == NUD_STALE && !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } /* Update timestamp only once we know we will make a change to the * neighbour entry. Otherwise we risk to move the locktime window with * noop updates and ignore relevant ARP updates. */ if (new != old || lladdr != neigh->ha) neigh->updated = jiffies; if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? neigh->parms->reachable_time : 0))); neigh->nud_state = new; notify = 1; } if (lladdr != neigh->ha) { write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1); notify = 1; } if (new == old) goto out; if (new & NUD_CONNECTED) neigh_connect(neigh); else neigh_suspect(neigh); if (!(old & NUD_VALID)) { struct sk_buff *skb; /* Again: avoid dead loop if something went wrong */ while (neigh->nud_state & NUD_VALID && (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); rcu_read_lock(); /* Why not just use 'neigh' as-is? The problem is that * things such as shaper, eql, and sch_teql can end up * using alternative, different, neigh objects to output * the packet in the output path. So what we need to do * here is re-lookup the top-level neigh in the path so * we can reinject the packet there. */ n2 = NULL; if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; } n1->output(n1, skb); if (n2) neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); } __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) { neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? (neigh->flags | NTF_ROUTER) : (neigh->flags & ~NTF_ROUTER); } write_unlock_bh(&neigh->lock); if (notify) neigh_update_notify(neigh, nlmsg_pid); return err; } EXPORT_SYMBOL(neigh_update); /* Update the neigh to listen temporarily for probe responses, even if it is * in a NUD_FAILED state. The caller has to hold neigh->lock for writing. */ void __neigh_set_probe_once(struct neighbour *neigh) { if (neigh->dead) return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); } EXPORT_SYMBOL(__neigh_set_probe_once); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev) { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); /* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n) { struct net_device *dev = n->dev; __be16 prot = n->tbl->protocol; struct hh_cache *hh = &n->hh; write_lock_bh(&n->lock); /* Only one thread can come in here and initialize the * hh_cache entry. */ if (!hh->hh_len) dev->header_ops->cache(n, hh, prot); write_unlock_bh(&n->lock); } /* Slow and careful. */ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { int rc = 0; if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; out_kfree_skb: rc = -EINVAL; kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_resolve_output); /* As fast as possible without hh cache */ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) { struct net_device *dev = neigh->dev; unsigned int seq; int err; do { __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) err = dev_queue_xmit(skb); else { err = -EINVAL; kfree_skb(skb); } return err; } EXPORT_SYMBOL(neigh_connected_output); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) { return dev_queue_xmit(skb); } EXPORT_SYMBOL(neigh_direct_output); static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; spin_lock(&tbl->proxy_queue.lock); skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { long tdif = NEIGH_CB(skb)->sched_next - now; if (tdif <= 0) { struct net_device *dev = skb->dev; __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); rcu_read_unlock(); } else { kfree_skb(skb); } dev_put(dev); } else if (!sched_next || tdif < sched_next) sched_next = tdif; } del_timer(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); } void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { unsigned long now = jiffies; unsigned long sched_next = now + (prandom_u32() % NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } NEIGH_CB(skb)->sched_next = sched_next; NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (del_timer(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } EXPORT_SYMBOL(pneigh_enqueue); static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct net *net, int ifindex) { struct neigh_parms *p; list_for_each_entry(p, &tbl->parms_list, list) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } return NULL; } struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); dev_hold(dev); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { dev_put(dev); kfree(p); return NULL; } write_lock_bh(&tbl->lock); list_add(&p->list, &tbl->parms.list); write_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } return p; } EXPORT_SYMBOL(neigh_parms_alloc); static void neigh_rcu_free_parms(struct rcu_head *head) { struct neigh_parms *parms = container_of(head, struct neigh_parms, rcu_head); neigh_parms_put(parms); } void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; write_lock_bh(&tbl->lock); list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); if (parms->dev) dev_put(parms->dev); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); static void neigh_parms_destroy(struct neigh_parms *parms) { kfree(parms); } static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly; void neigh_table_init(int index, struct neigh_table *tbl) { unsigned long now = jiffies; unsigned long phsize; INIT_LIST_HEAD(&tbl->parms_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); #ifdef CONFIG_PROC_FS if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, &neigh_stat_seq_ops, tbl)) panic("cannot create neighbour proc dir entry"); #endif RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); if (!tbl->entry_size) tbl->entry_size = ALIGN(offsetof(struct neighbour, primary_key) + tbl->key_len, NEIGH_PRIV_ALIGN); else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); tbl->last_flush = now; tbl->last_rand = now + tbl->parms.reachable_time * 20; neigh_tables[index] = tbl; } EXPORT_SYMBOL(neigh_table_init); int neigh_table_clear(int index, struct neigh_table *tbl) { neigh_tables[index] = NULL; /* It is not clean... Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; remove_proc_entry(tbl->id, init_net.proc_net_stat); free_percpu(tbl->stats); tbl->stats = NULL; return 0; } EXPORT_SYMBOL(neigh_table_clear); static struct neigh_table *neigh_find_table(int family) { struct neigh_table *tbl = NULL; switch (family) { case AF_INET: tbl = neigh_tables[NEIGH_ARP_TABLE]; break; case AF_INET6: tbl = neigh_tables[NEIGH_ND_TABLE]; break; case AF_DECnet: tbl = neigh_tables[NEIGH_DN_TABLE]; break; } return tbl; } static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *dst_attr; struct neigh_table *tbl; struct neighbour *neigh; struct net_device *dev = NULL; int err = -EINVAL; ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); if (dst_attr == NULL) goto out; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(dst_attr) < (int)tbl->key_len) goto out; if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; goto out; } err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh, tbl); write_unlock_bh(&tbl->lock); out: return err; } static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct neigh_table *tbl; struct net_device *dev = NULL; struct neighbour *neigh; void *dst, *lladdr; int err; ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) goto out; err = -EINVAL; if (tb[NDA_DST] == NULL) goto out; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) goto out; } tbl = neigh_find_table(ndm->ndm_family); if (tbl == NULL) return -EAFNOSUPPORT; if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm->ndm_flags; err = 0; } goto out; } if (dev == NULL) goto out; neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } neigh = __neigh_lookup_errno(tbl, dst, dev); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~NEIGH_UPDATE_F_OVERRIDE; } if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; if (ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; } else err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid); neigh_release(neigh); out: return err; } static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) { struct nlattr *nest; nest = nla_nest_start(skb, NDTA_PARMS); if (nest == NULL) return -ENOBUFS; if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, NEIGH_VAR(parms, QUEUE_LEN_BYTES) / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, NEIGH_VAR(parms, PROXY_QLEN)) || nla_put_u32(skb, NDTPA_APP_PROBES, NEIGH_VAR(parms, APP_PROBES)) || nla_put_u32(skb, NDTPA_UCAST_PROBES, NEIGH_VAR(parms, UCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_PROBES, NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, u32 pid, u32 seq, int type, int flags) { struct nlmsghdr *nlh; struct ndtmsg *ndtmsg; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) goto nla_put_failure; { unsigned long now = jiffies; long flush_delta = now - tbl->last_flush; long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); rcu_read_unlock_bh(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; } { int cpu; struct ndt_stats ndst; memset(&ndst, 0, sizeof(ndst)); for_each_possible_cpu(cpu) { struct neigh_statistics *st; st = per_cpu_ptr(tbl->stats, cpu); ndst.ndts_allocs += st->allocs; ndst.ndts_destroys += st->destroys; ndst.ndts_hash_grows += st->hash_grows; ndst.ndts_res_failed += st->res_failed; ndst.ndts_lookups += st->lookups; ndst.ndts_hits += st->hits; ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; ndst.ndts_forced_gc_runs += st->forced_gc_runs; ndst.ndts_table_fulls += st->table_fulls; } if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, NDTA_PAD)) goto nla_put_failure; } BUG_ON(tbl->parms.dev); if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int neightbl_fill_param_info(struct sk_buff *skb, struct neigh_table *tbl, struct neigh_parms *parms, u32 pid, u32 seq, int type, unsigned int flags) { struct ndtmsg *ndtmsg; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); if (nlh == NULL) return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || neightbl_fill_parms(skb, parms) < 0) goto errout; read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; errout: read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { [NDTA_NAME] = { .type = NLA_STRING }, [NDTA_THRESH1] = { .type = NLA_U32 }, [NDTA_THRESH2] = { .type = NLA_U32 }, [NDTA_THRESH3] = { .type = NLA_U32 }, [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, [NDTA_PARMS] = { .type = NLA_NESTED }, }; static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, [NDTPA_MCAST_REPROBES] = { .type = NLA_U32 }, [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; struct ndtmsg *ndtmsg; struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, nl_neightbl_policy, extack); if (err < 0) goto errout; if (tb[NDTA_NAME] == NULL) { err = -EINVAL; goto errout; } ndtmsg = nlmsg_data(nlh); for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { tbl = neigh_tables[tidx]; if (!tbl) continue; if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } if (!found) return -ENOENT; /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ write_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; struct neigh_parms *p; int i, ifindex = 0; err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; if (tbp[NDTPA_IFINDEX]) ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); p = lookup_neigh_parms(tbl, net, ifindex); if (p == NULL) { err = -ENOENT; goto errout_tbl_lock; } for (i = 1; i <= NDTPA_MAX; i++) { if (tbp[i] == NULL) continue; switch (i) { case NDTPA_QUEUE_LEN: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i]) * SKB_TRUESIZE(ETH_FRAME_LEN)); break; case NDTPA_QUEUE_LENBYTES: NEIGH_VAR_SET(p, QUEUE_LEN_BYTES, nla_get_u32(tbp[i])); break; case NDTPA_PROXY_QLEN: NEIGH_VAR_SET(p, PROXY_QLEN, nla_get_u32(tbp[i])); break; case NDTPA_APP_PROBES: NEIGH_VAR_SET(p, APP_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_UCAST_PROBES: NEIGH_VAR_SET(p, UCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_PROBES: NEIGH_VAR_SET(p, MCAST_PROBES, nla_get_u32(tbp[i])); break; case NDTPA_MCAST_REPROBES: NEIGH_VAR_SET(p, MCAST_REPROBES, nla_get_u32(tbp[i])); break; case NDTPA_BASE_REACHABLE_TIME: NEIGH_VAR_SET(p, BASE_REACHABLE_TIME, nla_get_msecs(tbp[i])); /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, nla_get_msecs(tbp[i])); break; case NDTPA_DELAY_PROBE_TIME: NEIGH_VAR_SET(p, DELAY_PROBE_TIME, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); break; case NDTPA_ANYCAST_DELAY: NEIGH_VAR_SET(p, ANYCAST_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_PROXY_DELAY: NEIGH_VAR_SET(p, PROXY_DELAY, nla_get_msecs(tbp[i])); break; case NDTPA_LOCKTIME: NEIGH_VAR_SET(p, LOCKTIME, nla_get_msecs(tbp[i])); break; } } } err = -ENOENT; if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && !net_eq(net, &init_net)) goto errout_tbl_lock; if (tb[NDTA_THRESH1]) tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); if (tb[NDTA_THRESH2]) tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); if (tb[NDTA_THRESH3]) tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); if (tb[NDTA_GC_INTERVAL]) tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); err = 0; errout_tbl_lock: write_unlock_bh(&tbl->lock); errout: return err; } static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; tbl = neigh_tables[tidx]; if (!tbl) continue; if (tidx < tbl_skip || (family && tbl->family != family)) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; nidx = 0; p = list_next_entry(&tbl->parms, list); list_for_each_entry_from(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; if (nidx < neigh_skip) goto next; if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; next: nidx++; } neigh_skip = 0; } out: cb->args[0] = tidx; cb->args[1] = nidx; return skb->len; } static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = neigh->flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { read_unlock_bh(&neigh->lock); goto nla_put_failure; } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = pn->flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); if (!master || master->ifindex != master_idx) return true; return false; } static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { if (filter_idx && dev->ifindex != filter_idx) return true; return false; } static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; struct nlattr *tb[NDA_MAX + 1]; struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; struct neigh_hash_table *nht; int filter_master_idx = 0, filter_idx = 0; unsigned int flags = NLM_F_MULTI; int err; err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); if (!err) { if (tb[NDA_IFINDEX]) { if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) return -EINVAL; filter_idx = nla_get_u32(tb[NDA_IFINDEX]); } if (tb[NDA_MASTER]) { if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) return -EINVAL; filter_master_idx = nla_get_u32(tb[NDA_MASTER]); } if (filter_idx || filter_master_idx) flags |= NLM_F_DUMP_FILTERED; } rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; n != NULL; n = rcu_dereference_bh(n->next)) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter_idx) || neigh_master_filtered(n->dev, filter_master_idx)) goto next; if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags) < 0) { rc = -1; goto out; } next: idx++; } } rc = skb->len; out: rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; } static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { struct pneigh_entry *n; struct net *net = sock_net(skb->sk); int rc, h, s_h = cb->args[3]; int idx, s_idx = idx = cb->args[4]; read_lock_bh(&tbl->lock); for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, tbl) < 0) { read_unlock_bh(&tbl->lock); rc = -1; goto out; } next: idx++; } } read_unlock_bh(&tbl->lock); rc = skb->len; out: cb->args[3] = h; cb->args[4] = idx; return rc; } static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { struct neigh_table *tbl; int t, family, s_t; int proxy = 0; int err; family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; /* check for full ndmsg structure presence, family member is * the same for both structures */ if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) proxy = 1; s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { tbl = neigh_tables[t]; if (!tbl) continue; if (t < s_t || (family && tbl->family != family)) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (proxy) err = pneigh_dump_table(tbl, skb, cb); else err = neigh_dump_table(tbl, skb, cb); if (err < 0) break; } cb->args[0] = t; return skb->len; } void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; struct neigh_hash_table *nht; rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); read_lock(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; for (n = rcu_dereference_bh(nht->hash_buckets[chain]); n != NULL; n = rcu_dereference_bh(n->next)) cb(n, cookie); } read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); /* The tbl->lock must be held as a writer and BH disabled. */ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; struct neigh_hash_table *nht; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; struct neighbour __rcu **np; np = &nht->hash_buckets[chain]; while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); } } } EXPORT_SYMBOL(__neigh_for_each_release); int neigh_xmit(int index, struct net_device *dev, const void *addr, struct sk_buff *skb) { int err = -EAFNOSUPPORT; if (likely(index < NEIGH_NR_TABLES)) { struct neigh_table *tbl; struct neighbour *neigh; tbl = neigh_tables[index]; if (!tbl) goto out; rcu_read_lock_bh(); if (index == NEIGH_ARP_TABLE) { u32 key = *((u32 *)addr); neigh = __ipv4_neigh_lookup_noref(dev, key); } else { neigh = __neigh_lookup_noref(tbl, addr, dev); } if (!neigh) neigh = __neigh_create(tbl, addr, dev, false); err = PTR_ERR(neigh); if (IS_ERR(neigh)) { rcu_read_unlock_bh(); goto out_kfree_skb; } err = neigh->output(neigh, skb); rcu_read_unlock_bh(); } else if (index == NEIGH_LINK_TABLE) { err = dev_hard_header(skb, dev, ntohs(skb->protocol), addr, NULL, skb->len); if (err < 0) goto out_kfree_skb; err = dev_queue_xmit(skb); } out: return err; out_kfree_skb: kfree_skb(skb); goto out; } EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { if (!net_eq(dev_net(n->dev), net)) goto next; if (state->neigh_sub_iter) { loff_t fakep = 0; void *v; v = state->neigh_sub_iter(state, n, &fakep); if (!v) goto next; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) break; if (n->nud_state & ~NUD_NOARP) break; next: n = rcu_dereference_bh(n->next); } if (n) break; } state->bucket = bucket; return n; } static struct neighbour *neigh_get_next(struct seq_file *seq, struct neighbour *n, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; } n = rcu_dereference_bh(n->next); while (1) { while (n) { if (!net_eq(dev_net(n->dev), net)) goto next; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); if (v) return n; goto next; } if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) break; if (n->nud_state & ~NUD_NOARP) break; next: n = rcu_dereference_bh(n->next); } if (n) break; if (++state->bucket >= (1 << nht->hash_shift)) break; n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) --(*pos); return n; } static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) { struct neighbour *n = neigh_get_first(seq); if (n) { --(*pos); while (*pos) { n = neigh_get_next(seq, n, pos); if (!n) break; } } return *pos ? NULL : n; } static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; struct pneigh_entry *pn = NULL; int bucket = state->bucket; state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { pn = tbl->phash_buckets[bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } state->bucket = bucket; return pn; } static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct pneigh_entry *pn, loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; do { pn = pn->next; } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; pn = tbl->phash_buckets[state->bucket]; while (pn && !net_eq(pneigh_net(pn), net)) pn = pn->next; if (pn) break; } if (pn && pos) --(*pos); return pn; } static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) { struct pneigh_entry *pn = pneigh_get_first(seq); if (pn) { --(*pos); while (*pos) { pn = pneigh_get_next(seq, pn, pos); if (!pn) break; } } return *pos ? NULL : pn; } static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; void *rc; loff_t idxpos = *pos; rc = neigh_get_idx(seq, &idxpos); if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_idx(seq, &idxpos); return rc; } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; state->tbl = tbl; state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock_bh(); state->nht = rcu_dereference_bh(tbl->nht); read_lock(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_seq_state *state; void *rc; if (v == SEQ_START_TOKEN) { rc = neigh_get_first(seq); goto out; } state = seq->private; if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { rc = neigh_get_next(seq, v, NULL); if (rc) goto out; if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) rc = pneigh_get_first(seq); } else { BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); rc = pneigh_get_next(seq, v, NULL); } out: ++(*pos); return rc; } EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) __releases(rcu_bh) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); /* statistics via seq_file */ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } return NULL; } static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return per_cpu_ptr(tbl->stats, cpu); } (*pos)++; return NULL; } static void neigh_stat_seq_stop(struct seq_file *seq, void *v) { } static int neigh_stat_seq_show(struct seq_file *seq, void *v) { struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " "%08lx %08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, st->destroys, st->hash_grows, st->lookups, st->hits, st->res_failed, st->rcv_probes_mcast, st->rcv_probes_ucast, st->periodic_gc_runs, st->forced_gc_runs, st->unres_discards, st->table_fulls ); return 0; } static const struct seq_operations neigh_stat_seq_ops = { .start = neigh_stat_seq_start, .next = neigh_stat_seq_next, .stop = neigh_stat_seq_stop, .show = neigh_stat_seq_show, }; #endif /* CONFIG_PROC_FS */ static inline size_t neigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4); /* NDA_PROBES */ } static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } void neigh_app_ns(struct neighbour *n) { __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL static int zero; static int int_max = INT_MAX; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; tmp.extra1 = &zero; tmp.extra2 = &unres_qlen_max; tmp.data = &size; size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; } static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, int family) { switch (family) { case AF_INET: return __in_dev_arp_parms_get_rcu(dev); case AF_INET6: return __in6_dev_nd_parms_get_rcu(dev); } return NULL; } static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { struct net_device *dev; int family = neigh_parms_family(p); rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct neigh_parms *dst_p = neigh_get_dev_parms_rcu(dev, family); if (dst_p && !test_bit(index, dst_p->data_state)) dst_p->data[index] = p->data[index]; } rcu_read_unlock(); } static void neigh_proc_update(struct ctl_table *ctl, int write) { struct net_device *dev = ctl->extra1; struct neigh_parms *p = ctl->extra2; struct net *net = neigh_parms_net(p); int index = (int *) ctl->data - p->data; if (!write) return; set_bit(index, p->data_state); if (index == NEIGH_VAR_DELAY_PROBE_TIME) call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; tmp.extra1 = &zero; tmp.extra2 = &int_max; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec); int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); neigh_proc_update(ctl, write); return ret; } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time_ms") == 0) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0) { /* update reachable_time as well, otherwise, the change will * only be effective after the next time neigh_periodic_work * decides to recompute it */ p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); } return ret; } #define NEIGH_PARMS_DATA_OFFSET(index) \ (&((struct neigh_parms *) 0)->data[index]) #define NEIGH_SYSCTL_ENTRY(attr, data_attr, name, mval, proc) \ [NEIGH_VAR_ ## attr] = { \ .procname = name, \ .data = NEIGH_PARMS_DATA_OFFSET(NEIGH_VAR_ ## data_attr), \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ } #define NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_zero_intmax) #define NEIGH_SYSCTL_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_jiffies) #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) #define NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_unres_qlen) static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; } neigh_sysctl_template __read_mostly = { .neigh_vars = { NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(UCAST_PROBES, "ucast_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(APP_PROBES, "app_solicit"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_REPROBES, "mcast_resolicit"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(ANYCAST_DELAY, "anycast_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(PROXY_DELAY, "proxy_delay"), NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(LOCKTIME, "locktime"), NEIGH_SYSCTL_UNRES_QLEN_REUSED_ENTRY(QUEUE_LEN, QUEUE_LEN_BYTES, "unres_qlen"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(RETRANS_TIME_MS, RETRANS_TIME, "retrans_time_ms"), NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(BASE_REACHABLE_TIME_MS, BASE_REACHABLE_TIME, "base_reachable_time_ms"), [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .extra1 = &zero, .extra2 = &int_max, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .extra1 = &zero, .extra2 = &int_max, .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, .extra1 = &zero, .extra2 = &int_max, .proc_handler = proc_dointvec_minmax, }, {}, }, }; int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *handler) { int i; struct neigh_sysctl_table *t; const char *dev_name_source; char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); if (!t) goto err; for (i = 0; i < NEIGH_VAR_GC_INTERVAL; i++) { t->neigh_vars[i].data += (long) p; t->neigh_vars[i].extra1 = dev; t->neigh_vars[i].extra2 = p; } if (dev) { dev_name_source = dev->name; /* Terminate the table early */ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { struct neigh_table *tbl = p->tbl; dev_name_source = "default"; t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = &tbl->gc_interval; t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = &tbl->gc_thresh1; t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = &tbl->gc_thresh2; t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = &tbl->gc_thresh3; } if (handler) { /* RetransTime */ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; /* RetransTime (in milliseconds)*/ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; } else { /* Those handlers will update p->reachable_time after * base_reachable_time(_ms) is set to ensure the new timer starts being * applied after the next neighbour update instead of waiting for * neigh_periodic_work to update its value (can be multiple minutes) * So any handler that replaces them should do this as well */ /* ReachableTime */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = neigh_proc_base_reachable_time; /* ReachableTime (in milliseconds) */ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = neigh_proc_base_reachable_time; } /* Don't export sysctls to unprivileged users */ if (neigh_parms_net(p)->user_ns != &init_user_ns) t->neigh_vars[0].procname = NULL; switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; break; case AF_INET6: p_name = "ipv6"; break; default: BUG(); } snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars); if (!t->sysctl_header) goto free; p->sysctl_table = t; return 0; free: kfree(t); err: return -ENOBUFS; } EXPORT_SYMBOL(neigh_sysctl_register); void neigh_sysctl_unregister(struct neigh_parms *p) { if (p->sysctl_table) { struct neigh_sysctl_table *t = p->sysctl_table; p->sysctl_table = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } } EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ static int __init neigh_init(void) { rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); return 0; } subsys_initcall(neigh_init); |
1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 | /** * \file drm_agpsupport.c * DRM support for AGP/GART backend * * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Gareth Hughes <gareth@valinux.com> */ /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <drm/drmP.h> #include <linux/module.h> #include <linux/slab.h> #include "drm_legacy.h" #include <asm/agp.h> /** * Get AGP information. * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a (output) drm_agp_info structure. * \return zero on success or a negative number on failure. * * Verifies the AGP device has been initialized and acquired and fills in the * drm_agp_info structure with the information in drm_agp_head::agp_info. */ int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info) { struct agp_kern_info *kern; if (!dev->agp || !dev->agp->acquired) return -EINVAL; kern = &dev->agp->agp_info; info->agp_version_major = kern->version.major; info->agp_version_minor = kern->version.minor; info->mode = kern->mode; info->aperture_base = kern->aper_base; info->aperture_size = kern->aper_size * 1024 * 1024; info->memory_allowed = kern->max_memory << PAGE_SHIFT; info->memory_used = kern->current_memory << PAGE_SHIFT; info->id_vendor = kern->device->vendor; info->id_device = kern->device->device; return 0; } EXPORT_SYMBOL(drm_agp_info); int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_info *info = data; int err; err = drm_agp_info(dev, info); if (err) return err; return 0; } /** * Acquire the AGP device. * * \param dev DRM device that is to acquire AGP. * \return zero on success or a negative number on failure. * * Verifies the AGP device hasn't been acquired before and calls * \c agp_backend_acquire. */ int drm_agp_acquire(struct drm_device *dev) { if (!dev->agp) return -ENODEV; if (dev->agp->acquired) return -EBUSY; dev->agp->bridge = agp_backend_acquire(dev->pdev); if (!dev->agp->bridge) return -ENODEV; dev->agp->acquired = 1; return 0; } EXPORT_SYMBOL(drm_agp_acquire); /** * Acquire the AGP device (ioctl). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg user argument. * \return zero on success or a negative number on failure. * * Verifies the AGP device hasn't been acquired before and calls * \c agp_backend_acquire. */ int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return drm_agp_acquire((struct drm_device *) file_priv->minor->dev); } /** * Release the AGP device. * * \param dev DRM device that is to release AGP. * \return zero on success or a negative number on failure. * * Verifies the AGP device has been acquired and calls \c agp_backend_release. */ int drm_agp_release(struct drm_device *dev) { if (!dev->agp || !dev->agp->acquired) return -EINVAL; agp_backend_release(dev->agp->bridge); dev->agp->acquired = 0; return 0; } EXPORT_SYMBOL(drm_agp_release); int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return drm_agp_release(dev); } /** * Enable the AGP bus. * * \param dev DRM device that has previously acquired AGP. * \param mode Requested AGP mode. * \return zero on success or a negative number on failure. * * Verifies the AGP device has been acquired but not enabled, and calls * \c agp_enable. */ int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode) { if (!dev->agp || !dev->agp->acquired) return -EINVAL; dev->agp->mode = mode.mode; agp_enable(dev->agp->bridge, mode.mode); dev->agp->enabled = 1; return 0; } EXPORT_SYMBOL(drm_agp_enable); int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_mode *mode = data; return drm_agp_enable(dev, *mode); } /** * Allocate AGP memory. * * \param inode device inode. * \param file_priv file private pointer. * \param cmd command. * \param arg pointer to a drm_agp_buffer structure. * \return zero on success or a negative number on failure. * * Verifies the AGP device is present and has been acquired, allocates the * memory via agp_allocate_memory() and creates a drm_agp_mem entry for it. */ int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request) { struct drm_agp_mem *entry; struct agp_memory *memory; unsigned long pages; u32 type; if (!dev->agp || !dev->agp->acquired) return -EINVAL; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; pages = (request->size + PAGE_SIZE - 1) / PAGE_SIZE; type = (u32) request->type; memory = agp_allocate_memory(dev->agp->bridge, pages, type); if (!memory) { kfree(entry); return -ENOMEM; } entry->handle = (unsigned long)memory->key + 1; entry->memory = memory; entry->bound = 0; entry->pages = pages; list_add(&entry->head, &dev->agp->memory); request->handle = entry->handle; request->physical = memory->physical; return 0; } EXPORT_SYMBOL(drm_agp_alloc); int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_buffer *request = data; return drm_agp_alloc(dev, request); } /** * Search for the AGP memory entry associated with a handle. * * \param dev DRM device structure. * \param handle AGP memory handle. * \return pointer to the drm_agp_mem structure associated with \p handle. * * Walks through drm_agp_head::memory until finding a matching handle. */ static struct drm_agp_mem *drm_agp_lookup_entry(struct drm_device *dev, unsigned long handle) { struct drm_agp_mem *entry; list_for_each_entry(entry, &dev->agp->memory, head) { if (entry->handle == handle) return entry; } return NULL; } /** * Unbind AGP memory from the GATT (ioctl). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_agp_binding structure. * \return zero on success or a negative number on failure. * * Verifies the AGP device is present and acquired, looks-up the AGP memory * entry and passes it to the unbind_agp() function. */ int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request) { struct drm_agp_mem *entry; int ret; if (!dev->agp || !dev->agp->acquired) return -EINVAL; entry = drm_agp_lookup_entry(dev, request->handle); if (!entry || !entry->bound) return -EINVAL; ret = drm_unbind_agp(entry->memory); if (ret == 0) entry->bound = 0; return ret; } EXPORT_SYMBOL(drm_agp_unbind); int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_binding *request = data; return drm_agp_unbind(dev, request); } /** * Bind AGP memory into the GATT (ioctl) * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_agp_binding structure. * \return zero on success or a negative number on failure. * * Verifies the AGP device is present and has been acquired and that no memory * is currently bound into the GATT. Looks-up the AGP memory entry and passes * it to bind_agp() function. */ int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request) { struct drm_agp_mem *entry; int retcode; int page; if (!dev->agp || !dev->agp->acquired) return -EINVAL; entry = drm_agp_lookup_entry(dev, request->handle); if (!entry || entry->bound) return -EINVAL; page = (request->offset + PAGE_SIZE - 1) / PAGE_SIZE; retcode = drm_bind_agp(entry->memory, page); if (retcode) return retcode; entry->bound = dev->agp->base + (page << PAGE_SHIFT); DRM_DEBUG("base = 0x%lx entry->bound = 0x%lx\n", dev->agp->base, entry->bound); return 0; } EXPORT_SYMBOL(drm_agp_bind); int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_binding *request = data; return drm_agp_bind(dev, request); } /** * Free AGP memory (ioctl). * * \param inode device inode. * \param file_priv DRM file private. * \param cmd command. * \param arg pointer to a drm_agp_buffer structure. * \return zero on success or a negative number on failure. * * Verifies the AGP device is present and has been acquired and looks up the * AGP memory entry. If the memory it's currently bound, unbind it via * unbind_agp(). Frees it via free_agp() as well as the entry itself * and unlinks from the doubly linked list it's inserted in. */ int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request) { struct drm_agp_mem *entry; if (!dev->agp || !dev->agp->acquired) return -EINVAL; entry = drm_agp_lookup_entry(dev, request->handle); if (!entry) return -EINVAL; if (entry->bound) drm_unbind_agp(entry->memory); list_del(&entry->head); drm_free_agp(entry->memory, entry->pages); kfree(entry); return 0; } EXPORT_SYMBOL(drm_agp_free); int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_agp_buffer *request = data; return drm_agp_free(dev, request); } /** * Initialize the AGP resources. * * \return pointer to a drm_agp_head structure. * * Gets the drm_agp_t structure which is made available by the agpgart module * via the inter_module_* functions. Creates and initializes a drm_agp_head * structure. * * Note that final cleanup of the kmalloced structure is directly done in * drm_pci_agp_destroy. */ struct drm_agp_head *drm_agp_init(struct drm_device *dev) { struct drm_agp_head *head = NULL; head = kzalloc(sizeof(*head), GFP_KERNEL); if (!head) return NULL; head->bridge = agp_find_bridge(dev->pdev); if (!head->bridge) { head->bridge = agp_backend_acquire(dev->pdev); if (!head->bridge) { kfree(head); return NULL; } agp_copy_info(head->bridge, &head->agp_info); agp_backend_release(head->bridge); } else { agp_copy_info(head->bridge, &head->agp_info); } if (head->agp_info.chipset == NOT_SUPPORTED) { kfree(head); return NULL; } INIT_LIST_HEAD(&head->memory); head->cant_use_aperture = head->agp_info.cant_use_aperture; head->page_mask = head->agp_info.page_mask; head->base = head->agp_info.aper_base; return head; } /* Only exported for i810.ko */ EXPORT_SYMBOL(drm_agp_init); /** * drm_legacy_agp_clear - Clear AGP resource list * @dev: DRM device * * Iterate over all AGP resources and remove them. But keep the AGP head * intact so it can still be used. It is safe to call this if AGP is disabled or * was already removed. * * Cleanup is only done for drivers who have DRIVER_LEGACY set. */ void drm_legacy_agp_clear(struct drm_device *dev) { struct drm_agp_mem *entry, *tempe; if (!dev->agp) return; if (!drm_core_check_feature(dev, DRIVER_LEGACY)) return; list_for_each_entry_safe(entry, tempe, &dev->agp->memory, head) { if (entry->bound) drm_unbind_agp(entry->memory); drm_free_agp(entry->memory, entry->pages); kfree(entry); } INIT_LIST_HEAD(&dev->agp->memory); if (dev->agp->acquired) drm_agp_release(dev); dev->agp->acquired = 0; dev->agp->enabled = 0; } /** * Binds a collection of pages into AGP memory at the given offset, returning * the AGP memory structure containing them. * * No reference is held on the pages during this time -- it is up to the * caller to handle that. */ struct agp_memory * drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, u32 type) { struct agp_memory *mem; int ret, i; DRM_DEBUG("\n"); mem = agp_allocate_memory(dev->agp->bridge, num_pages, type); if (mem == NULL) { DRM_ERROR("Failed to allocate memory for %ld pages\n", num_pages); return NULL; } for (i = 0; i < num_pages; i++) mem->pages[i] = pages[i]; mem->page_count = num_pages; mem->is_flushed = true; ret = agp_bind_memory(mem, gtt_offset / PAGE_SIZE); if (ret != 0) { DRM_ERROR("Failed to bind AGP memory: %d\n", ret); agp_free_memory(mem); return NULL; } return mem; } EXPORT_SYMBOL(drm_agp_bind_pages); |
99 2138 225 2138 11 72 13 72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 | /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <net/protocol.h> #include <net/ip.h> #include <net/checksum.h> #include <net/route.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/ipv4/nf_nat_masquerade.h> unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; rt = skb_rtable(skb); nh = rt_nexthop(rt, ip_hdr(skb)->daddr); newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static int device_cmp(struct nf_conn *i, void *ifindex) { const struct nf_conn_nat *nat = nfct_nat(i); if (!nat) return 0; if (nf_ct_l3num(i) != NFPROTO_IPV4) return 0; return nat->masq_index == (int)(long)ifindex; } static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for * conntracks which were associated with that device, * and forget them. */ WARN_ON(dev->ifindex == 0); nf_ct_iterate_cleanup_net(net, device_cmp, (void *)(long)dev->ifindex, 0, 0); } return NOTIFY_DONE; } static int inet_cmp(struct nf_conn *ct, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; if (!device_cmp(ct, (void *)(long)dev->ifindex)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; return ifa->ifa_address == tuple->dst.u3.ip; } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; struct net *net = dev_net(idev->dev); /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ if (idev->dead) return NOTIFY_DONE; if (event == NETDEV_DOWN) nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; static int masq_refcnt; static DEFINE_MUTEX(masq_mutex); int nf_nat_masquerade_ipv4_register_notifier(void) { int ret = 0; mutex_lock(&masq_mutex); /* check if the notifier was already set */ if (++masq_refcnt > 1) goto out_unlock; /* Register for device down reports */ ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; /* Register IP address change reports */ ret = register_inetaddr_notifier(&masq_inet_notifier); if (ret) goto err_unregister; mutex_unlock(&masq_mutex); return ret; err_unregister: unregister_netdevice_notifier(&masq_dev_notifier); err_dec: masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); void nf_nat_masquerade_ipv4_unregister_notifier(void) { mutex_lock(&masq_mutex); /* check if the notifier still has clients */ if (--masq_refcnt > 0) goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); out_unlock: mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); |
162 2 162 22 7 7 7 22 25 25 3 2 3 20 49 41 49 40 40 37 20 11 25 14 14 3 3 38 1623 1623 15 505 325 325 325 72 140 243 129 124 116 275 275 268 116 3 2 3 2 2 2 161 162 162 162 162 162 156 2 162 162 162 162 162 162 157 157 2 2 133 11 133 133 133 133 133 133 29 29 29 29 29 29 132 133 133 133 25 3 2 1 2 5 2 2 7 3 5 5 4 3 2 2 2 1 1 1 1 1 30 30 30 29 32 28 28 28 28 28 28 26 25 26 13 8 13 1 1 5 6 25 26 2 2 28 28 4 24 3 3 25 4 28 14 2 28 25 26 26 24 14 13 14 13 14 14 26 26 10 26 8 7 6 2 8 26 3 3 27 13 13 11 10 7 3 3 62 62 62 62 59 57 5 7 1 32 13 192 460 459 394 103 4 2 2135 345 337 337 337 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 | /* * Neighbour Discovery for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ /* * Changes: * * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt * of an RA. * Janos Farkas : kmalloc failure checks * Alexey Kuznetsov : state machine reworked * and moved to net/core. * Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 0, pad); opt += pad; space -= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) { ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", __func__); return NULL; } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ skb_set_owner_w(skb, sk); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? idev->cnf.ndisc_tclass : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } static void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; int err; struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len); rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; struct nd_msg *msg; int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, &tmpaddr)) return; src_addr = &tmpaddr; } if (!dev->addr_len) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, .icmp6_router = router, .icmp6_solicited = solicited, .icmp6_override = override, }, .target = *solicited_addr, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; idev = in6_dev_get(dev); if (!idev) return; read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { /* skip tentative addresses until dad completes */ if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_OPTIMISTIC)) continue; ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); } read_unlock_bh(&idev->lock); in6_dev_put(idev); } void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce) { struct sk_buff *skb; struct in6_addr addr_buf; int inc_opt = dev->addr_len; int optlen = 0; struct nd_msg *msg; if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, }, .target = *solicit, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) { u8 *opt = skb_put(skb, 8); opt[0] = ND_OPT_NONCE; opt[1] = 8 >> 3; memcpy(opt + 2, &nonce, 6); } ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct sk_buff *skb; struct rs_msg *msg; int send_sllao = dev->addr_len; int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * According to section 2.2 of RFC 4429, we must not * send router solicitations with a sllao from * optimistic addresses, but we may send the solicitation * if we don't include the sllao. So here we check * if our address is optimistic, and if so, we * suppress the inclusion of the sllao. */ if (send_sllao) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; } in6_ifa_put(ifp); } else { send_sllao = 0; } } #endif if (send_sllao) optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct rs_msg) { .icmph = { .icmp6_type = NDISC_ROUTER_SOLICITATION, }, }; if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { /* * "The sender MUST return an ICMP * destination unreachable" */ dst_link_failure(skb); kfree_skb(skb); } /* Called with locked neigh: either read or both */ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct net_device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(neigh->nud_state & NUD_VALID)) { ND_PRINTK(1, dbg, "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } static int pndisc_is_router(const void *pkey, struct net_device *dev) { struct pneigh_entry *n; int ret = -1; read_lock_bh(&nd_tbl.lock); n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) ret = !!(n->flags & NTF_ROUTER); read_unlock_bh(&nd_tbl.lock); return ret; } void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); bool inc; int is_router = -1; u64 nonce = 0; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NS: packet too short\n"); return; } if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return; } /* * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); return; } /* RFC2461 7.1.1: * If the IP source address is the unspecified address, * there MUST NOT be source link-layer address option * in the message. */ if (dad) { ND_PRINTK(2, warn, "NS: bad DAD packet (link-layer address option)\n"); return; } } if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { have_ifp: if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { if (dad) { if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ ND_PRINTK(2, notice, "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", ifp->idev->dev->name, &ifp->addr, np); goto out; } /* * We are colliding with another node * who is doing DAD * so fail our DAD process */ addrconf_dad_failure(skb, ifp); return; } else { /* * This is not a dad solicitation. * If we are an optimistic node, * we should respond. * Otherwise, we should ignore it. */ if (!(ifp->flags & IFA_F_OPTIMISTIC)) goto out; } } idev = ifp->idev; } else { struct net *net = dev_net(dev); /* perhaps an address on the master device */ if (netif_is_l3_slave(dev)) { struct net_device *mdev; mdev = netdev_master_upper_dev_get_rcu(dev); if (mdev) { ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1); if (ifp) goto have_ifp; } } idev = in6_dev_get(dev); if (!idev) { /* XXX: count this drop? */ return; } if (ipv6_chk_acast_addr(net, dev, &msg->target) || (idev->cnf.forwarding && (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc && NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) { /* * for anycast or proxy, * sender should delay its response * by a random time between 0 and * MAX_ANYCAST_DELAY_TIME seconds. * (RFC2461) -- yoshfuji */ struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); if (n) pneigh_enqueue(&nd_tbl, idev->nd_parms, n); goto out; } } else goto out; } if (is_router < 0) is_router = idev->cnf.forwarding; if (dad) { ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target, !!is_router, false, (ifp != NULL), true); goto out; } if (inc) NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); else NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); /* * update / create cache entry * for the source address */ neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE, NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); if (neigh) neigh_release(neigh); } out: if (ifp) in6_ifa_put(ifp); else in6_dev_put(idev); } static void ndisc_recv_na(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); struct inet6_ifaddr *ifp; struct neighbour *neigh; if (skb->len < sizeof(struct nd_msg)) { ND_PRINTK(2, warn, "NA: packet too short\n"); return; } if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NA: target address is multicast\n"); return; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); return; } /* For some 802.11 wireless deployments (and possibly other networks), * there will be a NA proxy and unsolicitd packets are attacks * and thus should not be accepted. */ if (!msg->icmph.icmp6_solicited && idev && idev->cnf.drop_unsolicited_na) return; if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NA: invalid link-layer address length\n"); return; } } ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); if (ifp) { if (skb->pkt_type != PACKET_LOOPBACK && (ifp->flags & IFA_F_TENTATIVE)) { addrconf_dad_failure(skb, ifp); return; } /* What should we make now? The advertisement is invalid, but ndisc specs say nothing about it. It could be misconfiguration, or an smart proxy agent tries to help us :-) We should not print the error if NA has been received from loopback - it is just our own unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) ND_PRINTK(1, warn, "NA: %pM advertised our address %pI6c on %s!\n", eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); in6_ifa_put(ifp); return; } neigh = neigh_lookup(&nd_tbl, &msg->target, dev); if (neigh) { u8 old_flags = neigh->flags; struct net *net = dev_net(dev); if (neigh->nud_state & NUD_FAILED) goto out; /* * Don't update the neighbor cache entry on a proxy NA from * ourselves because either the proxied node is off link or it * has already sent a NA to us. */ if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } ndisc_update(dev, neigh, lladdr, msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* * Change: router to host */ rt6_clean_tohost(dev_net(dev), saddr); } out: neigh_release(neigh); } } static void ndisc_recv_rs(struct sk_buff *skb) { struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); unsigned long ndoptlen = skb->len - sizeof(*rs_msg); struct neighbour *neigh; struct inet6_dev *idev; const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; struct ndisc_options ndopts; u8 *lladdr = NULL; if (skb->len < sizeof(*rs_msg)) return; idev = __in6_dev_get(skb->dev); if (!idev) { ND_PRINTK(1, err, "RS: can't find in6 device\n"); return; } /* Don't accept RS if we're not in router mode */ if (!idev->cnf.forwarding) goto out; /* * Don't update NCE if src = ::; * this implies that the source node has no ip address assigned yet. */ if (ipv6_addr_any(saddr)) goto out; /* Parse ND options */ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) goto out; } neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER, NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); } out: return; } static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) { struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); struct sk_buff *skb; struct nlmsghdr *nlh; struct nduseroptmsg *ndmsg; struct net *net = dev_net(ra->dev); int err; int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + (opt->nd_opt_len << 3)); size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr)); skb = nlmsg_new(msg_size, GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); if (!nlh) { goto nla_put_failure; } ndmsg = nlmsg_data(nlh); ndmsg->nduseropt_family = AF_INET6; ndmsg->nduseropt_ifindex = ra->dev->ifindex; ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); err = -EMSGSIZE; errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } static void ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; struct fib6_info *rt = NULL; struct net *net; int lifetime; struct ndisc_options ndopts; int optlen; unsigned int pref = 0; __u32 old_if_flags; bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); ND_PRINTK(2, info, "RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "RA: source address is not link-local\n"); return; } if (optlen < 0) { ND_PRINTK(2, warn, "RA: packet too short\n"); return; } #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); return; } #endif /* * set the RA_RECV flag in the interface */ in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", skb->dev->name); return; } if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, skb->dev->name); goto skip_linkparms; } #endif if (in6_dev->if_flags & IF_RS_SENT) { /* * flag that an RA was received after an RS was sent * out on this interface. */ in6_dev->if_flags |= IF_RA_RCVD; } /* * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? IF_RA_MANAGED : 0) | (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); if (old_if_flags != in6_dev->if_flags) send_ifinfo_notify = true; if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", __func__, skb->dev->name); goto skip_defrtr; } /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ if (pref == ICMPV6_ROUTER_PREF_INVALID || !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, rt->fib6_nh.nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return; } } if (rt && lifetime == 0) { ip6_del_rt(net, rt); rt = NULL; } ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n", rt, lifetime, skb->dev->name); if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev, pref); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", __func__); return; } neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, rt->fib6_nh.nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); fib6_info_release(rt); return; } neigh->flags |= NTF_ROUTER; } else if (rt) { rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) fib6_set_expires(rt, jiffies + (HZ * lifetime)); if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } } skip_defrtr: /* * Update Reachable Time and Retrans Timer */ if (in6_dev->nd_parms) { unsigned long rtime = ntohl(ra_msg->retrans_timer); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { rtime = (rtime*HZ)/1000; if (rtime < HZ/10) rtime = HZ/10; if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) { NEIGH_VAR_SET(in6_dev->nd_parms, BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } } } /* * Send a notify if RA changed managed/otherconf flags or timer settings */ if (send_ifinfo_notify) inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); skip_linkparms: /* * Process options. */ if (!neigh) neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, skb->dev, 1); if (neigh) { u8 *lladdr = NULL; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { ND_PRINTK(2, warn, "RA: invalid link-layer address length\n"); goto out; } } ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER, NDISC_ROUTER_ADVERTISEMENT, &ndopts); } if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, accept_ra is false for dev: %s\n", __func__, skb->dev->name); goto out; } #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); goto skip_routeinfo; } if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_ri; p; p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { struct route_info *ri = (struct route_info *)p; #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && ri->prefix_len == 0) continue; #endif if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, &ipv6_hdr(skb)->saddr); } } skip_routeinfo: #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { ND_PRINTK(2, info, "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", __func__, skb->dev->name); goto out; } #endif if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { struct nd_opt_hdr *p; for (p = ndopts.nd_opts_pi; p; p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { addrconf_prefix_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, ndopts.nd_opts_src_lladdr != NULL); } } if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { __be32 n; u32 mtu; memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } if (ndopts.nd_useropts) { struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; p = ndisc_next_useropt(skb->dev, p, ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: fib6_info_release(rt); if (neigh) neigh_release(neigh); } static void ndisc_redirect_rcv(struct sk_buff *skb) { u8 *hdr; struct ndisc_options ndopts; struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb); u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct rd_msg, opt)); #ifdef CONFIG_IPV6_NDISC_NODETYPE switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: ND_PRINTK(2, warn, "Redirect: from host or unauthorized router\n"); return; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: source address is not link-local\n"); return; } if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return; if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), skb->dev->ifindex, 0); return; } hdr = (u8 *)ndopts.nd_opts_rh; hdr += 8; if (!pskb_pull(skb, hdr - skb_transport_header(skb))) return; icmpv6_notify(skb, NDISC_REDIRECT, 0, 0); } static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, struct sk_buff *orig_skb, int rd_len) { u8 *opt = skb_put(skb, rd_len); memset(opt, 0, 8); *(opt++) = ND_OPT_REDIRECT_HDR; *(opt++) = (rd_len >> 3); opt += 6; skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt, rd_len - 8); } void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; struct sk_buff *buff; struct rd_msg *msg; struct in6_addr saddr_buf; struct rt6_info *rt; struct dst_entry *dst; struct flowi6 fl6; int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { ND_PRINTK(2, warn, "Redirect: target address is not link-local unicast\n"); return; } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK(2, warn, "Redirect: destination is not a neighbour\n"); goto release; } peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); ret = inet_peer_xrlim_allow(peer, 1*HZ); if (peer) inet_putpeer(peer); if (!ret) goto release; if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { ND_PRINTK(2, warn, "Redirect: no neigh for target address\n"); goto release; } read_lock_bh(&neigh->lock); if (neigh->nud_state & NUD_VALID) { memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; optlen += ndisc_redirect_opt_addr_space(dev, neigh, ops_data_buf, &ops_data); } else read_unlock_bh(&neigh->lock); neigh_release(neigh); } rd_len = min_t(unsigned int, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen, skb->len + 8); rd_len &= ~0x7; optlen += rd_len; buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!buff) goto release; msg = skb_put(buff, sizeof(*msg)); *msg = (struct rd_msg) { .icmph = { .icmp6_type = NDISC_REDIRECT, }, .target = *target, .dest = ipv6_hdr(skb)->daddr, }; /* * include target_address option */ if (ha) ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. */ if (rd_len) ndisc_fill_redirect_hdr_option(buff, skb, rd_len); skb_dst_set(buff, dst); ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf); return; release: dst_release(dst); } static void pndisc_redo(struct sk_buff *skb) { ndisc_recv_ns(skb); kfree_skb(skb); } static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); if (!idev) return true; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED && idev->cnf.suppress_frag_ndisc) { net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n"); return true; } return false; } int ndisc_rcv(struct sk_buff *skb) { struct nd_msg *msg; if (ndisc_suppress_frag_ndisc(skb)) return 0; if (skb_linearize(skb)) return 0; msg = (struct nd_msg *)skb_transport_header(skb); __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return 0; } if (msg->icmph.icmp6_code != 0) { ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return 0; } switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); ndisc_recv_ns(skb); break; case NDISC_NEIGHBOUR_ADVERTISEMENT: ndisc_recv_na(skb); break; case NDISC_ROUTER_SOLICITATION: ndisc_recv_rs(skb); break; case NDISC_ROUTER_ADVERTISEMENT: ndisc_router_discovery(skb); break; case NDISC_REDIRECT: ndisc_redirect_rcv(skb); break; } return 0; } static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); /* fallthrough */ case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; if (idev->cnf.ndisc_notify || net->ipv6.devconf_all->ndisc_notify) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); fib6_run_gc(0, net, false); break; case NETDEV_NOTIFY_PEERS: ndisc_send_unsol_na(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block ndisc_netdev_notifier = { .notifier_call = ndisc_netdev_event, .priority = ADDRCONF_NOTIFY_PRIORITY - 5, }; #ifdef CONFIG_SYSCTL static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, const char *func, const char *dev_name) { static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { strcpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, dev_name, ctl->procname); warned++; } } int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; int ret; if ((strcmp(ctl->procname, "retrans_time") == 0) || (strcmp(ctl->procname, "base_reachable_time") == 0)) ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = neigh_proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = neigh_proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); else ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) idev->nd_parms->reachable_time = neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); idev->tstamp = jiffies; inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); } return ret; } #endif static int __net_init ndisc_net_init(struct net *net) { struct ipv6_pinfo *np; struct sock *sk; int err; err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { ND_PRINTK(0, err, "NDISC: Failed to initialize the control socket (err %d)\n", err); return err; } net->ipv6.ndisc_sk = sk; np = inet6_sk(sk); np->hop_limit = 255; /* Do not loopback ndisc messages */ np->mc_loop = 0; return 0; } static void __net_exit ndisc_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.ndisc_sk); } static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, }; int __init ndisc_init(void) { int err; err = register_pernet_subsys(&ndisc_net_ops); if (err) return err; /* * Initialize the neighbour table */ neigh_table_init(NEIGH_ND_TABLE, &nd_tbl); #ifdef CONFIG_SYSCTL err = neigh_sysctl_register(NULL, &nd_tbl.parms, ndisc_ifinfo_sysctl_change); if (err) goto out_unregister_pernet; out: #endif return err; #ifdef CONFIG_SYSCTL out_unregister_pernet: unregister_pernet_subsys(&ndisc_net_ops); goto out; #endif } int __init ndisc_late_init(void) { return register_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_late_cleanup(void) { unregister_netdevice_notifier(&ndisc_netdev_notifier); } void ndisc_cleanup(void) { #ifdef CONFIG_SYSCTL neigh_sysctl_unregister(&nd_tbl.parms); #endif neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl); unregister_pernet_subsys(&ndisc_net_ops); } |
399 399 398 398 398 398 398 525 525 85 84 84 40 84 84 585 335 568 270 44 567 521 232 521 20 20 20 20 20 20 20 20 20 20 20 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2002 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * This abstraction represents an SCTP endpoint. * * The SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * The SCTP implementation is distributed in the hope that it * will be useful, but WITHOUT ANY WARRANTY; without even the implied * ************************ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU CC; see the file COPYING. If not, see * <http://www.gnu.org/licenses/>. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Dajiang Zhang <dajiang.zhang@nokia.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/random.h> /* get_random_bytes() */ #include <net/sock.h> #include <net/ipv6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static void sctp_endpoint_bh_rcv(struct work_struct *work); /* * Initialize the base fields of the endpoint structure. */ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { struct net *net = sock_net(sk); struct sctp_hmac_algo_param *auth_hmacs = NULL; struct sctp_chunks_param *auth_chunks = NULL; struct sctp_shared_key *null_key; int err; ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); if (!ep->digest) return NULL; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids, SCTP_AUTH_NUM_HMACS), gfp); if (!auth_hmacs) goto nomem; auth_chunks = kzalloc(sizeof(*auth_chunks) + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; /* Initialize the HMACS parameter. * SCTP-AUTH: Section 3.3 * Every endpoint supporting SCTP chunk authentication MUST * support the HMAC based on the SHA-1 algorithm. */ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO; auth_hmacs->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + 2); auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1); /* Initialize the CHUNKS parameter */ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS; auth_chunks->param_hdr.length = htons(sizeof(struct sctp_paramhdr)); /* If the Add-IP functionality is enabled, we must * authenticate, ASCONF and ASCONF-ACK chunks */ if (net->sctp.addip_enable) { auth_chunks->chunks[0] = SCTP_CID_ASCONF; auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK; auth_chunks->param_hdr.length = htons(sizeof(struct sctp_paramhdr) + 2); } } /* Initialize the base structure. */ /* What type of endpoint are we? */ ep->base.type = SCTP_EP_TYPE_SOCKET; /* Initialize the basic object fields. */ refcount_set(&ep->base.refcnt, 1); ep->base.dead = false; /* Create an input queue. */ sctp_inq_init(&ep->base.inqueue); /* Set its top-half handler */ sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv); /* Initialize the bind addr area */ sctp_bind_addr_init(&ep->base.bind_addr, 0); /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = net->sctp.sndbuf_policy; sk->sk_data_ready = sctp_data_ready; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; /* Initialize the secret key used with cookie. */ get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); null_key = sctp_auth_shkey_create(0, gfp); if (!null_key) goto nomem; list_add(&null_key->key_list, &ep->endpoint_shared_keys); /* Allocate and initialize transorms arrays for supported HMACs. */ err = sctp_auth_init_hmacs(ep, gfp); if (err) goto nomem_hmacs; /* Add the null key to the endpoint shared keys list and * set the hmcas and chunks pointers. */ ep->auth_hmacs_list = auth_hmacs; ep->auth_chunk_list = auth_chunks; ep->prsctp_enable = net->sctp.prsctp_enable; ep->reconf_enable = net->sctp.reconf_enable; /* Remember who we are attached to. */ ep->base.sk = sk; ep->base.net = sock_net(sk); sock_hold(ep->base.sk); return ep; nomem_hmacs: sctp_auth_destroy_keys(&ep->endpoint_shared_keys); nomem: /* Free all allocations */ kfree(auth_hmacs); kfree(auth_chunks); kfree(ep->digest); return NULL; } /* Create a sctp_endpoint with all that boring stuff initialized. * Returns NULL if there isn't enough memory. */ struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp) { struct sctp_endpoint *ep; /* Build a local endpoint. */ ep = kzalloc(sizeof(*ep), gfp); if (!ep) goto fail; if (!sctp_endpoint_init(ep, sk, gfp)) goto fail_init; SCTP_DBG_OBJCNT_INC(ep); return ep; fail_init: kfree(ep); fail: return NULL; } /* Add an association to an endpoint. */ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, struct sctp_association *asoc) { struct sock *sk = ep->base.sk; /* If this is a temporary association, don't bother * since we'll be removing it shortly and don't * want anyone to find it anyway. */ if (asoc->temp) return; /* Now just add it to our list of asocs */ list_add_tail(&asoc->asocs, &ep->asocs); /* Increment the backlog value for a TCP-style listening socket. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) sk->sk_ack_backlog++; } /* Free the endpoint structure. Delay cleanup until * all users have released their reference count on this structure. */ void sctp_endpoint_free(struct sctp_endpoint *ep) { ep->base.dead = true; inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED); /* Unlink this endpoint, so we can't find it again! */ sctp_unhash_endpoint(ep); sctp_endpoint_put(ep); } /* Final destructor for endpoint. */ static void sctp_endpoint_destroy_rcu(struct rcu_head *head) { struct sctp_endpoint *ep = container_of(head, struct sctp_endpoint, rcu); struct sock *sk = ep->base.sk; sctp_sk(sk)->ep = NULL; sock_put(sk); kfree(ep); SCTP_DBG_OBJCNT_DEC(ep); } static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { struct sock *sk; if (unlikely(!ep->base.dead)) { WARN(1, "Attempt to destroy undead endpoint %p!\n", ep); return; } /* Free the digest buffer */ kfree(ep->digest); /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ sctp_auth_destroy_keys(&ep->endpoint_shared_keys); kfree(ep->auth_hmacs_list); kfree(ep->auth_chunk_list); /* AUTH - Free any allocated HMAC transform containers */ sctp_auth_destroy_hmacs(ep->auth_hmacs); /* Cleanup. */ sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); memset(ep->secret_key, 0, sizeof(ep->secret_key)); sk = ep->base.sk; /* Remove and free the port */ if (sctp_sk(sk)->bind_hash) sctp_put_port(sk); call_rcu(&ep->rcu, sctp_endpoint_destroy_rcu); } /* Hold a reference to an endpoint. */ int sctp_endpoint_hold(struct sctp_endpoint *ep) { return refcount_inc_not_zero(&ep->base.refcnt); } /* Release a reference to an endpoint and clean up if there are * no more references. */ void sctp_endpoint_put(struct sctp_endpoint *ep) { if (refcount_dec_and_test(&ep->base.refcnt)) sctp_endpoint_destroy(ep); } /* Is this the endpoint we are looking for? */ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, const union sctp_addr *laddr) { struct sctp_endpoint *retval = NULL; if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && net_eq(sock_net(ep->base.sk), net)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; } return retval; } /* Find the association that goes with this chunk. * We lookup the transport from hashtable at first, then get association * through t->assoc. */ struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **transport) { struct sctp_association *asoc = NULL; struct sctp_transport *t; *transport = NULL; /* If the local port is not set, there can't be any associations * on this endpoint. */ if (!ep->base.bind_addr.port) return NULL; rcu_read_lock(); t = sctp_epaddr_lookup_transport(ep, paddr); if (!t) goto out; *transport = t; asoc = t->asoc; out: rcu_read_unlock(); return asoc; } /* Look for any peeled off association from the endpoint that matches the * given peer address. */ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; struct sctp_bind_addr *bp; struct net *net = sock_net(ep->base.sk); bp = &ep->base.bind_addr; /* This function is called with the socket lock held, * so the address_list can not change. */ list_for_each_entry(addr, &bp->address_list, list) { if (sctp_has_association(net, &addr->a, paddr)) return true; } return false; } /* Do delayed input processing. This is scheduled by sctp_rcv(). * This may be called on BH or task time. */ static void sctp_endpoint_bh_rcv(struct work_struct *work) { struct sctp_endpoint *ep = container_of(work, struct sctp_endpoint, base.inqueue.immediate); struct sctp_association *asoc; struct sock *sk; struct net *net; struct sctp_transport *transport; struct sctp_chunk *chunk; struct sctp_inq *inqueue; union sctp_subtype subtype; enum sctp_state state; int error = 0; int first_time = 1; /* is this the first time through the loop */ if (ep->base.dead) return; asoc = NULL; inqueue = &ep->base.inqueue; sk = ep->base.sk; net = sock_net(sk); while (NULL != (chunk = sctp_inq_pop(inqueue))) { subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); /* If the first chunk in the packet is AUTH, do special * processing specified in Section 6.3 of SCTP-AUTH spec */ if (first_time && (subtype.chunk == SCTP_CID_AUTH)) { struct sctp_chunkhdr *next_hdr; next_hdr = sctp_inq_peek(inqueue); if (!next_hdr) goto normal; /* If the next chunk is COOKIE-ECHO, skip the AUTH * chunk while saving a pointer to it so we can do * Authentication later (during cookie-echo * processing). */ if (next_hdr->type == SCTP_CID_COOKIE_ECHO) { chunk->auth_chunk = skb_clone(chunk->skb, GFP_ATOMIC); chunk->auth = 1; continue; } } normal: /* We might have grown an association since last we * looked, so try again. * * This happens when we've just processed our * COOKIE-ECHO chunk. */ if (NULL == chunk->asoc) { asoc = sctp_endpoint_lookup_assoc(ep, sctp_source(chunk), &transport); chunk->asoc = asoc; chunk->transport = transport; } state = asoc ? asoc->state : SCTP_STATE_CLOSED; if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth) continue; /* Remember where the last DATA chunk came from so we * know where to send the SACK. */ if (asoc && sctp_chunk_is_data(chunk)) asoc->peer.last_data_from = chunk->transport; else { SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS); if (asoc) asoc->stats.ictrlchunks++; } if (chunk->transport) chunk->transport->last_time_heard = ktime_get(); error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state, ep, asoc, chunk, GFP_ATOMIC); if (error && chunk) chunk->pdiscard = 1; /* Check to see if the endpoint is freed in response to * the incoming chunk. If so, get out of the while loop. */ if (!sctp_sk(sk)->ep) break; if (first_time) first_time = 0; } } |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 | /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * License terms: GNU General Public License (GPL) version 2 */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/caif/cfpkt.h> #define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 #define PKT_ERROR(pkt, errmsg) \ do { \ cfpkt_priv(pkt)->erronous = true; \ skb_reset_tail_pointer(&pkt->skb); \ pr_warn(errmsg); \ } while (0) struct cfpktq { struct sk_buff_head head; atomic_t count; /* Lock protects count updates */ spinlock_t lock; }; /* * net/caif/ is generic and does not * understand SKB, so we do this typecast */ struct cfpkt { struct sk_buff skb; }; /* Private data inside SKB */ struct cfpkt_priv_data { struct dev_info dev_info; bool erronous; }; static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt) { return (struct cfpkt_priv_data *) pkt->skb.cb; } static inline bool is_erronous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt) { return &pkt->skb; } static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) { return (struct cfpkt *) skb; } struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) { struct cfpkt *pkt = skb_to_pkt(nativepkt); cfpkt_priv(pkt)->erronous = false; return pkt; } EXPORT_SYMBOL(cfpkt_fromnative); void *cfpkt_tonative(struct cfpkt *pkt) { return (void *) pkt; } EXPORT_SYMBOL(cfpkt_tonative); static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx) { struct sk_buff *skb; skb = alloc_skb(len + pfx, GFP_ATOMIC); if (unlikely(skb == NULL)) return NULL; skb_reserve(skb, pfx); return skb_to_pkt(skb); } inline struct cfpkt *cfpkt_create(u16 len) { return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX); } void cfpkt_destroy(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); kfree_skb(skb); } inline bool cfpkt_more(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len > 0; } int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (skb_headlen(skb) >= len) { memcpy(data, skb->data, len); return 0; } return !cfpkt_extr_head(pkt, data, len) && !cfpkt_add_head(pkt, data, len); } int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(len > skb->len)) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } if (unlikely(len > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } from = skb_pull(skb, len); from -= len; if (data) memcpy(data, from, len); return 0; } EXPORT_SYMBOL(cfpkt_extr_head); int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *data = dta; u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb->data + len > skb_tail_pointer(skb))) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } from = skb_tail_pointer(skb) - len; skb_trim(skb, skb->len - len); memcpy(data, from, len); return 0; } int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) { return cfpkt_add_body(pkt, NULL, len); } int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; u16 addlen = 0; if (unlikely(is_erronous(pkt))) return -EPROTO; lastskb = skb; /* Check whether we need to add space at the tail */ if (unlikely(skb_tailroom(skb) < len)) { if (likely(len < PKT_LEN_WHEN_EXTENDING)) addlen = PKT_LEN_WHEN_EXTENDING; else addlen = len; } /* Check whether we need to change the SKB before writing to the tail */ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) { /* Make sure data is writable */ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { PKT_ERROR(pkt, "cow failed\n"); return -EPROTO; } } /* All set to put the last SKB and optionally write data there. */ to = pskb_put(skb, lastskb, len); if (likely(data)) memcpy(to, data, len); return 0; } inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data) { return cfpkt_add_body(pkt, &data, 1); } int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; const u8 *data = data2; int ret; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_headroom(skb) < len)) { PKT_ERROR(pkt, "no headroom\n"); return -EPROTO; } /* Make sure data is writable */ ret = skb_cow_data(skb, 0, &lastskb); if (unlikely(ret < 0)) { PKT_ERROR(pkt, "cow failed\n"); return ret; } to = skb_push(skb, len); memcpy(to, data, len); return 0; } EXPORT_SYMBOL(cfpkt_add_head); inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) { return cfpkt_add_body(pkt, data, len); } inline u16 cfpkt_getlen(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len; } int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16, void *, u16), u16 data) { /* * Don't care about the performance hit of linearizing, * Checksum should not be used on high-speed interfaces anyway. */ if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(&pkt->skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); } int cfpkt_setlen(struct cfpkt *pkt, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (unlikely(is_erronous(pkt))) return -EPROTO; if (likely(len <= skb->len)) { if (unlikely(skb->data_len)) ___pskb_trim(skb, len); else skb_trim(skb, len); return cfpkt_getlen(pkt); } /* Need to expand SKB */ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) PKT_ERROR(pkt, "skb_pad_trail failed\n"); return cfpkt_getlen(pkt); } struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt, u16 expectlen) { struct sk_buff *dst = pkt_to_skb(dstpkt); struct sk_buff *add = pkt_to_skb(addpkt); u16 addlen = skb_headlen(add); u16 neededtailspace; struct sk_buff *tmp; u16 dstlen; u16 createlen; if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { return dstpkt; } if (expectlen > addlen) neededtailspace = expectlen; else neededtailspace = addlen; if (dst->tail + neededtailspace > dst->end) { /* Create a dumplicate of 'dst' with more tail space */ struct cfpkt *tmppkt; dstlen = skb_headlen(dst); createlen = dstlen + neededtailspace; tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX); if (tmppkt == NULL) return NULL; tmp = pkt_to_skb(tmppkt); skb_set_tail_pointer(tmp, dstlen); tmp->len = dstlen; memcpy(tmp->data, dst->data, dstlen); cfpkt_destroy(dstpkt); dst = tmp; } memcpy(skb_tail_pointer(dst), add->data, skb_headlen(add)); cfpkt_destroy(addpkt); dst->tail += addlen; dst->len += addlen; return skb_to_pkt(dst); } struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) { struct sk_buff *skb2; struct sk_buff *skb = pkt_to_skb(pkt); struct cfpkt *tmppkt; u8 *split = skb->data + pos; u16 len2nd = skb_tail_pointer(skb) - split; if (unlikely(is_erronous(pkt))) return NULL; if (skb->data + pos > skb_tail_pointer(skb)) { PKT_ERROR(pkt, "trying to split beyond end of packet\n"); return NULL; } /* Create a new packet for the second part of the data */ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX, PKT_PREFIX); if (tmppkt == NULL) return NULL; skb2 = pkt_to_skb(tmppkt); if (skb2 == NULL) return NULL; /* Reduce the length of the original packet */ skb_set_tail_pointer(skb, pos); skb->len = pos; memcpy(skb2->data, split, len2nd); skb2->tail += len2nd; skb2->len += len2nd; skb2->priority = skb->priority; return skb_to_pkt(skb2); } bool cfpkt_erroneous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) { return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; } EXPORT_SYMBOL(cfpkt_info); void cfpkt_set_prio(struct cfpkt *pkt, int prio) { pkt_to_skb(pkt)->priority = prio; } EXPORT_SYMBOL(cfpkt_set_prio); |
7 6 2 5 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | /* * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/netfilter/xt_devgroup.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: Device group match"); MODULE_ALIAS("ipt_devgroup"); MODULE_ALIAS("ip6t_devgroup"); static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_devgroup_info *info = par->matchinfo; if (info->flags & XT_DEVGROUP_MATCH_SRC && (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^ ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) return false; if (info->flags & XT_DEVGROUP_MATCH_DST && (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^ ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) return false; return true; } static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) return -EINVAL; if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD))) return -EINVAL; if (info->flags & XT_DEVGROUP_MATCH_DST && par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) return -EINVAL; return 0; } static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, .checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, .me = THIS_MODULE }; static int __init devgroup_mt_init(void) { return xt_register_match(&devgroup_mt_reg); } static void __exit devgroup_mt_exit(void) { xt_unregister_match(&devgroup_mt_reg); } module_init(devgroup_mt_init); module_exit(devgroup_mt_exit); |
723 47 1247 107 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H #include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #define khugepaged_enabled() \ (transparent_hugepage_flags & \ ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) #define khugepaged_always() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_FLAG)) #define khugepaged_req_madv() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) #define khugepaged_defrag() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) return __khugepaged_enter(mm); return 0; } static inline void khugepaged_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } static inline int khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && !(vm_flags & VM_NOHUGEPAGE) && !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { return 0; } static inline void khugepaged_exit(struct mm_struct *mm) { } static inline int khugepaged_enter(struct vm_area_struct *vma, unsigned long vm_flags) { return 0; } static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) { return 0; } static inline void khugepaged_min_free_kbytes_update(void) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ |
852 861 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 extern int file_caps_enabled; typedef struct kernel_cap_struct { __u32 cap[_KERNEL_CAPABILITY_U32S]; } kernel_cap_t; /* exact same as vfs_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_init_eff_set; /* * Internal kernel functions only */ #define CAP_FOR_EACH_U32(__capi) \ for (__capi = 0; __capi < _KERNEL_CAPABILITY_U32S; ++__capi) /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK_B0 (CAP_TO_MASK(CAP_CHOWN) \ | CAP_TO_MASK(CAP_MKNOD) \ | CAP_TO_MASK(CAP_DAC_OVERRIDE) \ | CAP_TO_MASK(CAP_DAC_READ_SEARCH) \ | CAP_TO_MASK(CAP_FOWNER) \ | CAP_TO_MASK(CAP_FSETID)) # define CAP_FS_MASK_B1 (CAP_TO_MASK(CAP_MAC_OVERRIDE)) #if _KERNEL_CAPABILITY_U32S != 2 # error Fix up hand-coded capability macro initializers #else /* HAND-CODED capability initializers */ #define CAP_LAST_U32 ((_KERNEL_CAPABILITY_U32S) - 1) #define CAP_LAST_U32_VALID_MASK (CAP_TO_MASK(CAP_LAST_CAP + 1) -1) # define CAP_EMPTY_SET ((kernel_cap_t){{ 0, 0 }}) # define CAP_FULL_SET ((kernel_cap_t){{ ~0, CAP_LAST_U32_VALID_MASK }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \ CAP_FS_MASK_B1 } }) # define CAP_NFSD_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_SYS_RESOURCE), \ CAP_FS_MASK_B1 } }) #endif /* _KERNEL_CAPABILITY_U32S != 2 */ # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) #define cap_raised(c, flag) ((c).cap[CAP_TO_INDEX(flag)] & CAP_TO_MASK(flag)) #define CAP_BOP_ALL(c, a, b, OP) \ do { \ unsigned __capi; \ CAP_FOR_EACH_U32(__capi) { \ c.cap[__capi] = a.cap[__capi] OP b.cap[__capi]; \ } \ } while (0) #define CAP_UOP_ALL(c, a, OP) \ do { \ unsigned __capi; \ CAP_FOR_EACH_U32(__capi) { \ c.cap[__capi] = OP a.cap[__capi]; \ } \ } while (0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, b, |); return dest; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, b, &); return dest; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { kernel_cap_t dest; CAP_BOP_ALL(dest, a, drop, &~); return dest; } static inline kernel_cap_t cap_invert(const kernel_cap_t c) { kernel_cap_t dest; CAP_UOP_ALL(dest, c, ~); return dest; } static inline bool cap_isclear(const kernel_cap_t a) { unsigned __capi; CAP_FOR_EACH_U32(__capi) { if (a.cap[__capi] != 0) return false; } return true; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { kernel_cap_t dest; dest = cap_drop(a, set); return cap_isclear(dest); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { const kernel_cap_t __cap_fs_set = CAP_FS_SET; return cap_drop(a, __cap_fs_set); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { const kernel_cap_t __cap_fs_set = CAP_FS_SET; return cap_combine(a, cap_intersect(permitted, __cap_fs_set)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { const kernel_cap_t __cap_fs_set = CAP_NFSD_SET; return cap_drop(a, __cap_fs_set); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { const kernel_cap_t __cap_nfsd_set = CAP_NFSD_SET; return cap_combine(a, cap_intersect(permitted, __cap_nfsd_set)); } #ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ |
8 6 4 4 1 3 8 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | /* * This module is used to copy security markings from packets * to connections, and restore security markings from connections * back to packets. This would normally be performed in conjunction * with the SECMARK target and state match. * * Based somewhat on CONNMARK: * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> * by Henrik Nordstrom <hno@marasystems.com> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CONNSECMARK.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark"); MODULE_ALIAS("ipt_CONNSECMARK"); MODULE_ALIAS("ip6t_CONNSECMARK"); /* * If the packet has a security mark and the connection does not, copy * the security mark from the packet to the connection. */ static void secmark_save(const struct sk_buff *skb) { if (skb->secmark) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (ct && !ct->secmark) { ct->secmark = skb->secmark; nf_conntrack_event_cache(IPCT_SECMARK, ct); } } } /* * If packet has no security mark, and the connection does, restore the * security mark from the connection to the packet. */ static void secmark_restore(struct sk_buff *skb) { if (!skb->secmark) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (ct && ct->secmark) skb->secmark = ct->secmark; } } static unsigned int connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_connsecmark_target_info *info = par->targinfo; switch (info->mode) { case CONNSECMARK_SAVE: secmark_save(skb); break; case CONNSECMARK_RESTORE: secmark_restore(skb); break; default: BUG(); } return XT_CONTINUE; } static int connsecmark_tg_check(const struct xt_tgchk_param *par) { const struct xt_connsecmark_target_info *info = par->targinfo; int ret; if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "security") != 0) { pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", par->table); return -EINVAL; } switch (info->mode) { case CONNSECMARK_SAVE: case CONNSECMARK_RESTORE: break; default: pr_info_ratelimited("invalid mode: %hu\n", info->mode); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_target connsecmark_tg_reg __read_mostly = { .name = "CONNSECMARK", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connsecmark_tg_check, .destroy = connsecmark_tg_destroy, .target = connsecmark_tg, .targetsize = sizeof(struct xt_connsecmark_target_info), .me = THIS_MODULE, }; static int __init connsecmark_tg_init(void) { return xt_register_target(&connsecmark_tg_reg); } static void __exit connsecmark_tg_exit(void) { xt_unregister_target(&connsecmark_tg_reg); } module_init(connsecmark_tg_init); module_exit(connsecmark_tg_exit); |
25 24 24 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | /* * ebt_arp * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * Tim Gardner <timg@tpi.com> * * April, 2002 * */ #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_arp.h> static bool ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_arp_info *info = par->matchinfo; const struct arphdr *ah; struct arphdr _arph; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) return false; if ((info->bitmask & EBT_ARP_OPCODE) && NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op)) return false; if ((info->bitmask & EBT_ARP_HTYPE) && NF_INVF(info, EBT_ARP_HTYPE, info->htype != ah->ar_hrd)) return false; if ((info->bitmask & EBT_ARP_PTYPE) && NF_INVF(info, EBT_ARP_PTYPE, info->ptype != ah->ar_pro)) return false; if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP | EBT_ARP_GRAT)) { const __be32 *sap, *dap; __be32 saddr, daddr; if (ah->ar_pln != sizeof(__be32) || ah->ar_pro != htons(ETH_P_IP)) return false; sap = skb_header_pointer(skb, sizeof(struct arphdr) + ah->ar_hln, sizeof(saddr), &saddr); if (sap == NULL) return false; dap = skb_header_pointer(skb, sizeof(struct arphdr) + 2*ah->ar_hln+sizeof(saddr), sizeof(daddr), &daddr); if (dap == NULL) return false; if ((info->bitmask & EBT_ARP_SRC_IP) && NF_INVF(info, EBT_ARP_SRC_IP, info->saddr != (*sap & info->smsk))) return false; if ((info->bitmask & EBT_ARP_DST_IP) && NF_INVF(info, EBT_ARP_DST_IP, info->daddr != (*dap & info->dmsk))) return false; if ((info->bitmask & EBT_ARP_GRAT) && NF_INVF(info, EBT_ARP_GRAT, *dap != *sap)) return false; } if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { const unsigned char *mp; unsigned char _mac[ETH_ALEN]; if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER)) return false; if (info->bitmask & EBT_ARP_SRC_MAC) { mp = skb_header_pointer(skb, sizeof(struct arphdr), sizeof(_mac), &_mac); if (mp == NULL) return false; if (NF_INVF(info, EBT_ARP_SRC_MAC, !ether_addr_equal_masked(mp, info->smaddr, info->smmsk))) return false; } if (info->bitmask & EBT_ARP_DST_MAC) { mp = skb_header_pointer(skb, sizeof(struct arphdr) + ah->ar_hln + ah->ar_pln, sizeof(_mac), &_mac); if (mp == NULL) return false; if (NF_INVF(info, EBT_ARP_DST_MAC, !ether_addr_equal_masked(mp, info->dmaddr, info->dmmsk))) return false; } } return true; } static int ebt_arp_mt_check(const struct xt_mtchk_param *par) { const struct ebt_arp_info *info = par->matchinfo; const struct ebt_entry *e = par->entryinfo; if ((e->ethproto != htons(ETH_P_ARP) && e->ethproto != htons(ETH_P_RARP)) || e->invflags & EBT_IPROTO) return -EINVAL; if (info->bitmask & ~EBT_ARP_MASK || info->invflags & ~EBT_ARP_MASK) return -EINVAL; return 0; } static struct xt_match ebt_arp_mt_reg __read_mostly = { .name = "arp", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_arp_mt, .checkentry = ebt_arp_mt_check, .matchsize = sizeof(struct ebt_arp_info), .me = THIS_MODULE, }; static int __init ebt_arp_init(void) { return xt_register_match(&ebt_arp_mt_reg); } static void __exit ebt_arp_fini(void) { xt_unregister_match(&ebt_arp_mt_reg); } module_init(ebt_arp_init); module_exit(ebt_arp_fini); MODULE_DESCRIPTION("Ebtables: ARP protocol packet match"); MODULE_LICENSE("GPL"); |
18 17 18 18 18 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | /* * CBC: Cipher Block Chaining mode * * Copyright (c) 2016 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #ifndef _CRYPTO_CBC_H #define _CRYPTO_CBC_H #include <crypto/internal/skcipher.h> #include <linux/string.h> #include <linux/types.h> static inline int crypto_cbc_encrypt_segment( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 *iv = walk->iv; do { crypto_xor(iv, src, bsize); fn(tfm, iv, dst); memcpy(iv, dst, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static inline int crypto_cbc_encrypt_inplace( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *iv = walk->iv; do { crypto_xor(src, iv, bsize); fn(tfm, src, src); iv = src; src += bsize; } while ((nbytes -= bsize) >= bsize); memcpy(walk->iv, iv, bsize); return nbytes; } static inline int crypto_cbc_encrypt_walk(struct skcipher_request *req, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) err = crypto_cbc_encrypt_inplace(&walk, tfm, fn); else err = crypto_cbc_encrypt_segment(&walk, tfm, fn); err = skcipher_walk_done(&walk, err); } return err; } static inline int crypto_cbc_decrypt_segment( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 *iv = walk->iv; do { fn(tfm, src, dst); crypto_xor(dst, iv, bsize); iv = src; src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); memcpy(walk->iv, iv, bsize); return nbytes; } static inline int crypto_cbc_decrypt_inplace( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 last_iv[bsize]; /* Start of the last block. */ src += nbytes - (nbytes & (bsize - 1)) - bsize; memcpy(last_iv, src, bsize); for (;;) { fn(tfm, src, src); if ((nbytes -= bsize) < bsize) break; crypto_xor(src, src - bsize, bsize); src -= bsize; } crypto_xor(src, walk->iv, bsize); memcpy(walk->iv, last_iv, bsize); return nbytes; } static inline int crypto_cbc_decrypt_blocks( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { if (walk->src.virt.addr == walk->dst.virt.addr) return crypto_cbc_decrypt_inplace(walk, tfm, fn); else return crypto_cbc_decrypt_segment(walk, tfm, fn); } #endif /* _CRYPTO_CBC_H */ |
37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 39 37 37 37 37 37 37 12 4 4 1 4 4 4 4 4 4 4 5 5 5 4 1 5 1 1 5 5 5 5 5 5 5 3 5 2 5 5 2 2 5 1 4 5 5 3 4 4 4 2 4 4 4 4 4 4 4 4 4 4 2 5 2 2 2 2 2 2 1 1 1 4 3 4 2 2 1 2 2 1 2 2 2 2 5 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 3 4 2 1 4 2 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vid-cap.c - video capture support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-kthread-cap.h" #include "vivid-vid-cap.h" /* timeperframe: min/max and default */ static const struct v4l2_fract tpf_min = {.numerator = 1, .denominator = FPS_MAX}, tpf_max = {.numerator = FPS_MAX, .denominator = 1}; static const struct vivid_fmt formats_ovl[] = { { .fourcc = V4L2_PIX_FMT_RGB565, /* gggbbbbb rrrrrggg */ .vdownsampling = { 1 }, .bit_depth = { 16 }, .planes = 1, .buffers = 1, }, { .fourcc = V4L2_PIX_FMT_XRGB555, /* gggbbbbb arrrrrgg */ .vdownsampling = { 1 }, .bit_depth = { 16 }, .planes = 1, .buffers = 1, }, { .fourcc = V4L2_PIX_FMT_ARGB555, /* gggbbbbb arrrrrgg */ .vdownsampling = { 1 }, .bit_depth = { 16 }, .planes = 1, .buffers = 1, }, }; /* The number of discrete webcam framesizes */ #define VIVID_WEBCAM_SIZES 5 /* The number of discrete webcam frameintervals */ #define VIVID_WEBCAM_IVALS (VIVID_WEBCAM_SIZES * 2) /* Sizes must be in increasing order */ static const struct v4l2_frmsize_discrete webcam_sizes[VIVID_WEBCAM_SIZES] = { { 320, 180 }, { 640, 360 }, { 1280, 720 }, { 1920, 1080 }, { 3840, 2160 }, }; /* * Intervals must be in increasing order and there must be twice as many * elements in this array as there are in webcam_sizes. */ static const struct v4l2_fract webcam_intervals[VIVID_WEBCAM_IVALS] = { { 1, 1 }, { 1, 2 }, { 1, 4 }, { 1, 5 }, { 1, 10 }, { 1, 15 }, { 1, 25 }, { 1, 30 }, { 1, 50 }, { 1, 60 }, }; static int vid_cap_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); unsigned buffers = tpg_g_buffers(&dev->tpg); unsigned h = dev->fmt_cap_rect.height; unsigned p; if (dev->field_cap == V4L2_FIELD_ALTERNATE) { /* * You cannot use read() with FIELD_ALTERNATE since the field * information (TOP/BOTTOM) cannot be passed back to the user. */ if (vb2_fileio_is_active(vq)) return -EINVAL; } if (dev->queue_setup_error) { /* * Error injection: test what happens if queue_setup() returns * an error. */ dev->queue_setup_error = false; return -EINVAL; } if (*nplanes) { /* * Check if the number of requested planes match * the number of buffers in the current format. You can't mix that. */ if (*nplanes != buffers) return -EINVAL; for (p = 0; p < buffers; p++) { if (sizes[p] < tpg_g_line_width(&dev->tpg, p) * h + dev->fmt_cap->data_offset[p]) return -EINVAL; } } else { for (p = 0; p < buffers; p++) sizes[p] = tpg_g_line_width(&dev->tpg, p) * h + dev->fmt_cap->data_offset[p]; } if (vq->num_buffers + *nbuffers < 2) *nbuffers = 2 - vq->num_buffers; *nplanes = buffers; dprintk(dev, 1, "%s: count=%d\n", __func__, *nbuffers); for (p = 0; p < buffers; p++) dprintk(dev, 1, "%s: size[%u]=%u\n", __func__, p, sizes[p]); return 0; } static int vid_cap_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); unsigned long size; unsigned buffers = tpg_g_buffers(&dev->tpg); unsigned p; dprintk(dev, 1, "%s\n", __func__); if (WARN_ON(NULL == dev->fmt_cap)) return -EINVAL; if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } for (p = 0; p < buffers; p++) { size = tpg_g_line_width(&dev->tpg, p) * dev->fmt_cap_rect.height + dev->fmt_cap->data_offset[p]; if (vb2_plane_size(vb, p) < size) { dprintk(dev, 1, "%s data will not fit into plane %u (%lu < %lu)\n", __func__, p, vb2_plane_size(vb, p), size); return -EINVAL; } vb2_set_plane_payload(vb, p, size); vb->planes[p].data_offset = dev->fmt_cap->data_offset[p]; } return 0; } static void vid_cap_buf_finish(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct v4l2_timecode *tc = &vbuf->timecode; unsigned fps = 25; unsigned seq = vbuf->sequence; if (!vivid_is_sdtv_cap(dev)) return; /* * Set the timecode. Rarely used, so it is interesting to * test this. */ vbuf->flags |= V4L2_BUF_FLAG_TIMECODE; if (dev->std_cap & V4L2_STD_525_60) fps = 30; tc->type = (fps == 30) ? V4L2_TC_TYPE_30FPS : V4L2_TC_TYPE_25FPS; tc->flags = 0; tc->frames = seq % fps; tc->seconds = (seq / fps) % 60; tc->minutes = (seq / (60 * fps)) % 60; tc->hours = (seq / (60 * 60 * fps)) % 24; } static void vid_cap_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vid_cap_active); spin_unlock(&dev->slock); } static int vid_cap_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); unsigned i; int err; if (vb2_is_streaming(&dev->vb_vid_out_q)) dev->can_loop_video = vivid_vid_can_loop(dev); dev->vid_cap_seq_count = 0; dprintk(dev, 1, "%s\n", __func__); for (i = 0; i < VIDEO_MAX_FRAME; i++) dev->must_blank[i] = tpg_g_perc_fill(&dev->tpg) < 100; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_cap(dev, &dev->vid_cap_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vid_cap_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vid_cap_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_cap(dev, &dev->vid_cap_streaming); dev->can_loop_video = false; } const struct vb2_ops vivid_vid_cap_qops = { .queue_setup = vid_cap_queue_setup, .buf_prepare = vid_cap_buf_prepare, .buf_finish = vid_cap_buf_finish, .buf_queue = vid_cap_buf_queue, .start_streaming = vid_cap_start_streaming, .stop_streaming = vid_cap_stop_streaming, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, }; /* * Determine the 'picture' quality based on the current TV frequency: either * COLOR for a good 'signal', GRAY (grayscale picture) for a slightly off * signal or NOISE for no signal. */ void vivid_update_quality(struct vivid_dev *dev) { unsigned freq_modulus; if (dev->loop_video && (vivid_is_svid_cap(dev) || vivid_is_hdmi_cap(dev))) { /* * The 'noise' will only be replaced by the actual video * if the output video matches the input video settings. */ tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (vivid_is_hdmi_cap(dev) && VIVID_INVALID_SIGNAL(dev->dv_timings_signal_mode)) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (vivid_is_sdtv_cap(dev) && VIVID_INVALID_SIGNAL(dev->std_signal_mode)) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, 0); return; } if (!vivid_is_tv_cap(dev)) { tpg_s_quality(&dev->tpg, TPG_QUAL_COLOR, 0); return; } /* * There is a fake channel every 6 MHz at 49.25, 55.25, etc. * From +/- 0.25 MHz around the channel there is color, and from * +/- 1 MHz there is grayscale (chroma is lost). * Everywhere else it is just noise. */ freq_modulus = (dev->tv_freq - 676 /* (43.25-1) * 16 */) % (6 * 16); if (freq_modulus > 2 * 16) { tpg_s_quality(&dev->tpg, TPG_QUAL_NOISE, next_pseudo_random32(dev->tv_freq ^ 0x55) & 0x3f); return; } if (freq_modulus < 12 /*0.75 * 16*/ || freq_modulus > 20 /*1.25 * 16*/) tpg_s_quality(&dev->tpg, TPG_QUAL_GRAY, 0); else tpg_s_quality(&dev->tpg, TPG_QUAL_COLOR, 0); } /* * Get the current picture quality and the associated afc value. */ static enum tpg_quality vivid_get_quality(struct vivid_dev *dev, s32 *afc) { unsigned freq_modulus; if (afc) *afc = 0; if (tpg_g_quality(&dev->tpg) == TPG_QUAL_COLOR || tpg_g_quality(&dev->tpg) == TPG_QUAL_NOISE) return tpg_g_quality(&dev->tpg); /* * There is a fake channel every 6 MHz at 49.25, 55.25, etc. * From +/- 0.25 MHz around the channel there is color, and from * +/- 1 MHz there is grayscale (chroma is lost). * Everywhere else it is just gray. */ freq_modulus = (dev->tv_freq - 676 /* (43.25-1) * 16 */) % (6 * 16); if (afc) *afc = freq_modulus - 1 * 16; return TPG_QUAL_GRAY; } enum tpg_video_aspect vivid_get_video_aspect(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return dev->std_aspect_ratio; if (vivid_is_hdmi_cap(dev)) return dev->dv_timings_aspect_ratio; return TPG_VIDEO_ASPECT_IMAGE; } static enum tpg_pixel_aspect vivid_get_pixel_aspect(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return (dev->std_cap & V4L2_STD_525_60) ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; if (vivid_is_hdmi_cap(dev) && dev->src_rect.width == 720 && dev->src_rect.height <= 576) return dev->src_rect.height == 480 ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; return TPG_PIXEL_ASPECT_SQUARE; } /* * Called whenever the format has to be reset which can occur when * changing inputs, standard, timings, etc. */ void vivid_update_format_cap(struct vivid_dev *dev, bool keep_controls) { struct v4l2_bt_timings *bt = &dev->dv_timings_cap.bt; unsigned size; u64 pixelclock; switch (dev->input_type[dev->input]) { case WEBCAM: default: dev->src_rect.width = webcam_sizes[dev->webcam_size_idx].width; dev->src_rect.height = webcam_sizes[dev->webcam_size_idx].height; dev->timeperframe_vid_cap = webcam_intervals[dev->webcam_ival_idx]; dev->field_cap = V4L2_FIELD_NONE; tpg_s_rgb_range(&dev->tpg, V4L2_DV_RGB_RANGE_AUTO); break; case TV: case SVID: dev->field_cap = dev->tv_field_cap; dev->src_rect.width = 720; if (dev->std_cap & V4L2_STD_525_60) { dev->src_rect.height = 480; dev->timeperframe_vid_cap = (struct v4l2_fract) { 1001, 30000 }; dev->service_set_cap = V4L2_SLICED_CAPTION_525; } else { dev->src_rect.height = 576; dev->timeperframe_vid_cap = (struct v4l2_fract) { 1000, 25000 }; dev->service_set_cap = V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; } tpg_s_rgb_range(&dev->tpg, V4L2_DV_RGB_RANGE_AUTO); break; case HDMI: dev->src_rect.width = bt->width; dev->src_rect.height = bt->height; size = V4L2_DV_BT_FRAME_WIDTH(bt) * V4L2_DV_BT_FRAME_HEIGHT(bt); if (dev->reduced_fps && can_reduce_fps(bt)) { pixelclock = div_u64(bt->pixelclock * 1000, 1001); bt->flags |= V4L2_DV_FL_REDUCED_FPS; } else { pixelclock = bt->pixelclock; bt->flags &= ~V4L2_DV_FL_REDUCED_FPS; } dev->timeperframe_vid_cap = (struct v4l2_fract) { size / 100, (u32)pixelclock / 100 }; if (bt->interlaced) dev->field_cap = V4L2_FIELD_ALTERNATE; else dev->field_cap = V4L2_FIELD_NONE; /* * We can be called from within s_ctrl, in that case we can't * set/get controls. Luckily we don't need to in that case. */ if (keep_controls || !dev->colorspace) break; if (bt->flags & V4L2_DV_FL_IS_CE_VIDEO) { if (bt->width == 720 && bt->height <= 576) v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); else v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_709); v4l2_ctrl_s_ctrl(dev->real_rgb_range_cap, 1); } else { v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); v4l2_ctrl_s_ctrl(dev->real_rgb_range_cap, 0); } tpg_s_rgb_range(&dev->tpg, v4l2_ctrl_g_ctrl(dev->rgb_range_cap)); break; } vfree(dev->bitmap_cap); dev->bitmap_cap = NULL; vivid_update_quality(dev); tpg_reset_source(&dev->tpg, dev->src_rect.width, dev->src_rect.height, dev->field_cap); dev->crop_cap = dev->src_rect; dev->crop_bounds_cap = dev->src_rect; if (dev->bitmap_cap && (dev->compose_cap.width != dev->crop_cap.width || dev->compose_cap.height != dev->crop_cap.height)) { vfree(dev->bitmap_cap); dev->bitmap_cap = NULL; } dev->compose_cap = dev->crop_cap; if (V4L2_FIELD_HAS_T_OR_B(dev->field_cap)) dev->compose_cap.height /= 2; dev->fmt_cap_rect = dev->compose_cap; tpg_s_video_aspect(&dev->tpg, vivid_get_video_aspect(dev)); tpg_s_pixel_aspect(&dev->tpg, vivid_get_pixel_aspect(dev)); tpg_update_mv_step(&dev->tpg); } /* Map the field to something that is valid for the current input */ static enum v4l2_field vivid_field_cap(struct vivid_dev *dev, enum v4l2_field field) { if (vivid_is_sdtv_cap(dev)) { switch (field) { case V4L2_FIELD_INTERLACED_TB: case V4L2_FIELD_INTERLACED_BT: case V4L2_FIELD_SEQ_TB: case V4L2_FIELD_SEQ_BT: case V4L2_FIELD_TOP: case V4L2_FIELD_BOTTOM: case V4L2_FIELD_ALTERNATE: return field; case V4L2_FIELD_INTERLACED: default: return V4L2_FIELD_INTERLACED; } } if (vivid_is_hdmi_cap(dev)) return dev->dv_timings_cap.bt.interlaced ? V4L2_FIELD_ALTERNATE : V4L2_FIELD_NONE; return V4L2_FIELD_NONE; } static unsigned vivid_colorspace_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_colorspace(&dev->tpg); return dev->colorspace_out; } static unsigned vivid_xfer_func_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_xfer_func(&dev->tpg); return dev->xfer_func_out; } static unsigned vivid_ycbcr_enc_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_ycbcr_enc(&dev->tpg); return dev->ycbcr_enc_out; } static unsigned int vivid_hsv_enc_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_hsv_enc(&dev->tpg); return dev->hsv_enc_out; } static unsigned vivid_quantization_cap(struct vivid_dev *dev) { if (!dev->loop_video || vivid_is_webcam(dev) || vivid_is_tv_cap(dev)) return tpg_g_quantization(&dev->tpg); return dev->quantization_out; } int vivid_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; unsigned p; mp->width = dev->fmt_cap_rect.width; mp->height = dev->fmt_cap_rect.height; mp->field = dev->field_cap; mp->pixelformat = dev->fmt_cap->fourcc; mp->colorspace = vivid_colorspace_cap(dev); mp->xfer_func = vivid_xfer_func_cap(dev); if (dev->fmt_cap->color_enc == TGP_COLOR_ENC_HSV) mp->hsv_enc = vivid_hsv_enc_cap(dev); else mp->ycbcr_enc = vivid_ycbcr_enc_cap(dev); mp->quantization = vivid_quantization_cap(dev); mp->num_planes = dev->fmt_cap->buffers; for (p = 0; p < mp->num_planes; p++) { mp->plane_fmt[p].bytesperline = tpg_g_bytesperline(&dev->tpg, p); mp->plane_fmt[p].sizeimage = tpg_g_line_width(&dev->tpg, p) * mp->height + dev->fmt_cap->data_offset[p]; } return 0; } int vivid_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct v4l2_plane_pix_format *pfmt = mp->plane_fmt; struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; unsigned bytesperline, max_bpl; unsigned factor = 1; unsigned w, h; unsigned p; fmt = vivid_get_format(dev, mp->pixelformat); if (!fmt) { dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n", mp->pixelformat); mp->pixelformat = V4L2_PIX_FMT_YUYV; fmt = vivid_get_format(dev, mp->pixelformat); } mp->field = vivid_field_cap(dev, mp->field); if (vivid_is_webcam(dev)) { const struct v4l2_frmsize_discrete *sz = v4l2_find_nearest_size(webcam_sizes, VIVID_WEBCAM_SIZES, width, height, mp->width, mp->height); w = sz->width; h = sz->height; } else if (vivid_is_sdtv_cap(dev)) { w = 720; h = (dev->std_cap & V4L2_STD_525_60) ? 480 : 576; } else { w = dev->src_rect.width; h = dev->src_rect.height; } if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (vivid_is_webcam(dev) || (!dev->has_scaler_cap && !dev->has_crop_cap && !dev->has_compose_cap)) { mp->width = w; mp->height = h / factor; } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height * factor }; v4l2_rect_set_min_size(&r, &vivid_min_rect); v4l2_rect_set_max_size(&r, &vivid_max_rect); if (dev->has_scaler_cap && !dev->has_compose_cap) { struct v4l2_rect max_r = { 0, 0, MAX_ZOOM * w, MAX_ZOOM * h }; v4l2_rect_set_max_size(&r, &max_r); } else if (!dev->has_scaler_cap && dev->has_crop_cap && !dev->has_compose_cap) { v4l2_rect_set_max_size(&r, &dev->src_rect); } else if (!dev->has_scaler_cap && !dev->has_crop_cap) { v4l2_rect_set_min_size(&r, &dev->src_rect); } mp->width = r.width; mp->height = r.height / factor; } /* This driver supports custom bytesperline values */ mp->num_planes = fmt->buffers; for (p = 0; p < fmt->buffers; p++) { /* Calculate the minimum supported bytesperline value */ bytesperline = (mp->width * fmt->bit_depth[p]) >> 3; /* Calculate the maximum supported bytesperline value */ max_bpl = (MAX_ZOOM * MAX_WIDTH * fmt->bit_depth[p]) >> 3; if (pfmt[p].bytesperline > max_bpl) pfmt[p].bytesperline = max_bpl; if (pfmt[p].bytesperline < bytesperline) pfmt[p].bytesperline = bytesperline; pfmt[p].sizeimage = (pfmt[p].bytesperline * mp->height) / fmt->vdownsampling[p] + fmt->data_offset[p]; memset(pfmt[p].reserved, 0, sizeof(pfmt[p].reserved)); } for (p = fmt->buffers; p < fmt->planes; p++) pfmt[0].sizeimage += (pfmt[0].bytesperline * mp->height * (fmt->bit_depth[p] / fmt->vdownsampling[p])) / (fmt->bit_depth[0] / fmt->vdownsampling[0]); mp->colorspace = vivid_colorspace_cap(dev); if (fmt->color_enc == TGP_COLOR_ENC_HSV) mp->hsv_enc = vivid_hsv_enc_cap(dev); else mp->ycbcr_enc = vivid_ycbcr_enc_cap(dev); mp->xfer_func = vivid_xfer_func_cap(dev); mp->quantization = vivid_quantization_cap(dev); memset(mp->reserved, 0, sizeof(mp->reserved)); return 0; } int vivid_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_cap; struct v4l2_rect *compose = &dev->compose_cap; struct vb2_queue *q = &dev->vb_vid_cap_q; int ret = vivid_try_fmt_vid_cap(file, priv, f); unsigned factor = 1; unsigned p; unsigned i; if (ret < 0) return ret; if (vb2_is_busy(q)) { dprintk(dev, 1, "%s device busy\n", __func__); return -EBUSY; } if (dev->overlay_cap_owner && dev->fb_cap.fmt.pixelformat != mp->pixelformat) { dprintk(dev, 1, "overlay is active, can't change pixelformat\n"); return -EBUSY; } dev->fmt_cap = vivid_get_format(dev, mp->pixelformat); if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; /* Note: the webcam input doesn't support scaling, cropping or composing */ if (!vivid_is_webcam(dev) && (dev->has_scaler_cap || dev->has_crop_cap || dev->has_compose_cap)) { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; if (dev->has_scaler_cap) { if (dev->has_compose_cap) v4l2_rect_map_inside(compose, &r); else *compose = r; if (dev->has_crop_cap && !dev->has_compose_cap) { struct v4l2_rect min_r = { 0, 0, r.width / MAX_ZOOM, factor * r.height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, r.width * MAX_ZOOM, factor * r.height * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_r); v4l2_rect_set_max_size(crop, &max_r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } else if (dev->has_crop_cap) { struct v4l2_rect min_r = { 0, 0, compose->width / MAX_ZOOM, factor * compose->height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, compose->width * MAX_ZOOM, factor * compose->height * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_r); v4l2_rect_set_max_size(crop, &max_r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } } else if (dev->has_crop_cap && !dev->has_compose_cap) { r.height *= factor; v4l2_rect_set_size_to(crop, &r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); r = *crop; r.height /= factor; v4l2_rect_set_size_to(compose, &r); } else if (!dev->has_crop_cap) { v4l2_rect_map_inside(compose, &r); } else { r.height *= factor; v4l2_rect_set_max_size(crop, &r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); compose->top *= factor; compose->height *= factor; v4l2_rect_set_size_to(compose, crop); v4l2_rect_map_inside(compose, &r); compose->top /= factor; compose->height /= factor; } } else if (vivid_is_webcam(dev)) { /* Guaranteed to be a match */ for (i = 0; i < ARRAY_SIZE(webcam_sizes); i++) if (webcam_sizes[i].width == mp->width && webcam_sizes[i].height == mp->height) break; dev->webcam_size_idx = i; if (dev->webcam_ival_idx >= 2 * (VIVID_WEBCAM_SIZES - i)) dev->webcam_ival_idx = 2 * (VIVID_WEBCAM_SIZES - i) - 1; vivid_update_format_cap(dev, false); } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; v4l2_rect_set_size_to(compose, &r); r.height *= factor; v4l2_rect_set_size_to(crop, &r); } dev->fmt_cap_rect.width = mp->width; dev->fmt_cap_rect.height = mp->height; tpg_s_buf_height(&dev->tpg, mp->height); tpg_s_fourcc(&dev->tpg, dev->fmt_cap->fourcc); for (p = 0; p < tpg_g_buffers(&dev->tpg); p++) tpg_s_bytesperline(&dev->tpg, p, mp->plane_fmt[p].bytesperline); dev->field_cap = mp->field; if (dev->field_cap == V4L2_FIELD_ALTERNATE) tpg_s_field(&dev->tpg, V4L2_FIELD_TOP, true); else tpg_s_field(&dev->tpg, dev->field_cap, false); tpg_s_crop_compose(&dev->tpg, &dev->crop_cap, &dev->compose_cap); if (vivid_is_sdtv_cap(dev)) dev->tv_field_cap = mp->field; tpg_update_mv_step(&dev->tpg); return 0; } int vidioc_g_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_g_fmt_vid_cap(file, priv, f); } int vidioc_try_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_try_fmt_vid_cap(file, priv, f); } int vidioc_s_fmt_vid_cap_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_s_fmt_vid_cap(file, priv, f); } int vidioc_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_g_fmt_vid_cap); } int vidioc_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_try_fmt_vid_cap); } int vidioc_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_s_fmt_vid_cap); } int vivid_vid_cap_g_selection(struct file *file, void *priv, struct v4l2_selection *sel) { struct vivid_dev *dev = video_drvdata(file); if (!dev->has_crop_cap && !dev->has_compose_cap) return -ENOTTY; if (sel->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; if (vivid_is_webcam(dev)) return -ENODATA; sel->r.left = sel->r.top = 0; switch (sel->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_cap) return -EINVAL; sel->r = dev->crop_cap; break; case V4L2_SEL_TGT_CROP_DEFAULT: case V4L2_SEL_TGT_CROP_BOUNDS: if (!dev->has_crop_cap) return -EINVAL; sel->r = dev->src_rect; break; case V4L2_SEL_TGT_COMPOSE_BOUNDS: if (!dev->has_compose_cap) return -EINVAL; sel->r = vivid_max_rect; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_cap) return -EINVAL; sel->r = dev->compose_cap; break; case V4L2_SEL_TGT_COMPOSE_DEFAULT: if (!dev->has_compose_cap) return -EINVAL; sel->r = dev->fmt_cap_rect; break; default: return -EINVAL; } return 0; } int vivid_vid_cap_s_selection(struct file *file, void *fh, struct v4l2_selection *s) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_cap; struct v4l2_rect *compose = &dev->compose_cap; unsigned orig_compose_w = compose->width; unsigned orig_compose_h = compose->height; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_cap) ? 2 : 1; int ret; if (!dev->has_crop_cap && !dev->has_compose_cap) return -ENOTTY; if (s->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; if (vivid_is_webcam(dev)) return -ENODATA; switch (s->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_cap) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->src_rect); v4l2_rect_map_inside(&s->r, &dev->crop_bounds_cap); s->r.top /= factor; s->r.height /= factor; if (dev->has_scaler_cap) { struct v4l2_rect fmt = dev->fmt_cap_rect; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, s->r.height * MAX_ZOOM }; struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, s->r.height / MAX_ZOOM }; v4l2_rect_set_min_size(&fmt, &min_rect); if (!dev->has_compose_cap) v4l2_rect_set_max_size(&fmt, &max_rect); if (!v4l2_rect_same_size(&dev->fmt_cap_rect, &fmt) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; if (dev->has_compose_cap) { v4l2_rect_set_min_size(compose, &min_rect); v4l2_rect_set_max_size(compose, &max_rect); v4l2_rect_map_inside(compose, &fmt); } dev->fmt_cap_rect = fmt; tpg_s_buf_height(&dev->tpg, fmt.height); } else if (dev->has_compose_cap) { struct v4l2_rect fmt = dev->fmt_cap_rect; v4l2_rect_set_min_size(&fmt, &s->r); if (!v4l2_rect_same_size(&dev->fmt_cap_rect, &fmt) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->fmt_cap_rect = fmt; tpg_s_buf_height(&dev->tpg, fmt.height); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->fmt_cap_rect); } else { if (!v4l2_rect_same_size(&s->r, &dev->fmt_cap_rect) && vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; v4l2_rect_set_size_to(&dev->fmt_cap_rect, &s->r); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->fmt_cap_rect); tpg_s_buf_height(&dev->tpg, dev->fmt_cap_rect.height); } s->r.top *= factor; s->r.height *= factor; *crop = s->r; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_cap) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->fmt_cap_rect); if (dev->has_scaler_cap) { struct v4l2_rect max_rect = { 0, 0, dev->src_rect.width * MAX_ZOOM, (dev->src_rect.height / factor) * MAX_ZOOM }; v4l2_rect_set_max_size(&s->r, &max_rect); if (dev->has_crop_cap) { struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, (s->r.height * factor) / MAX_ZOOM }; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, (s->r.height * factor) * MAX_ZOOM }; v4l2_rect_set_min_size(crop, &min_rect); v4l2_rect_set_max_size(crop, &max_rect); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); } } else if (dev->has_crop_cap) { s->r.top *= factor; s->r.height *= factor; v4l2_rect_set_max_size(&s->r, &dev->src_rect); v4l2_rect_set_size_to(crop, &s->r); v4l2_rect_map_inside(crop, &dev->crop_bounds_cap); s->r.top /= factor; s->r.height /= factor; } else { v4l2_rect_set_size_to(&s->r, &dev->src_rect); s->r.height /= factor; } v4l2_rect_map_inside(&s->r, &dev->fmt_cap_rect); *compose = s->r; break; default: return -EINVAL; } if (dev->bitmap_cap && (compose->width != orig_compose_w || compose->height != orig_compose_h)) { vfree(dev->bitmap_cap); dev->bitmap_cap = NULL; } tpg_s_crop_compose(&dev->tpg, crop, compose); return 0; } int vivid_vid_cap_cropcap(struct file *file, void *priv, struct v4l2_cropcap *cap) { struct vivid_dev *dev = video_drvdata(file); if (cap->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) return -EINVAL; switch (vivid_get_pixel_aspect(dev)) { case TPG_PIXEL_ASPECT_NTSC: cap->pixelaspect.numerator = 11; cap->pixelaspect.denominator = 10; break; case TPG_PIXEL_ASPECT_PAL: cap->pixelaspect.numerator = 54; cap->pixelaspect.denominator = 59; break; case TPG_PIXEL_ASPECT_SQUARE: cap->pixelaspect.numerator = 1; cap->pixelaspect.denominator = 1; break; } return 0; } int vidioc_enum_fmt_vid_overlay(struct file *file, void *priv, struct v4l2_fmtdesc *f) { struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; if (dev->multiplanar) return -ENOTTY; if (f->index >= ARRAY_SIZE(formats_ovl)) return -EINVAL; fmt = &formats_ovl[f->index]; f->pixelformat = fmt->fourcc; return 0; } int vidioc_g_fmt_vid_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_cap; struct v4l2_window *win = &f->fmt.win; unsigned clipcount = win->clipcount; if (dev->multiplanar) return -ENOTTY; win->w.top = dev->overlay_cap_top; win->w.left = dev->overlay_cap_left; win->w.width = compose->width; win->w.height = compose->height; win->field = dev->overlay_cap_field; win->clipcount = dev->clipcount_cap; if (clipcount > dev->clipcount_cap) clipcount = dev->clipcount_cap; if (dev->bitmap_cap == NULL) win->bitmap = NULL; else if (win->bitmap) { if (copy_to_user(win->bitmap, dev->bitmap_cap, ((compose->width + 7) / 8) * compose->height)) return -EFAULT; } if (clipcount && win->clips) { if (copy_to_user(win->clips, dev->clips_cap, clipcount * sizeof(dev->clips_cap[0]))) return -EFAULT; } return 0; } int vidioc_try_fmt_vid_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_cap; struct v4l2_window *win = &f->fmt.win; int i, j; if (dev->multiplanar) return -ENOTTY; win->w.left = clamp_t(int, win->w.left, -dev->fb_cap.fmt.width, dev->fb_cap.fmt.width); win->w.top = clamp_t(int, win->w.top, -dev->fb_cap.fmt.height, dev->fb_cap.fmt.height); win->w.width = compose->width; win->w.height = compose->height; if (win->field != V4L2_FIELD_BOTTOM && win->field != V4L2_FIELD_TOP) win->field = V4L2_FIELD_ANY; win->chromakey = 0; win->global_alpha = 0; if (win->clipcount && !win->clips) win->clipcount = 0; if (win->clipcount > MAX_CLIPS) win->clipcount = MAX_CLIPS; if (win->clipcount) { if (copy_from_user(dev->try_clips_cap, win->clips, win->clipcount * sizeof(dev->clips_cap[0]))) return -EFAULT; for (i = 0; i < win->clipcount; i++) { struct v4l2_rect *r = &dev->try_clips_cap[i].c; r->top = clamp_t(s32, r->top, 0, dev->fb_cap.fmt.height - 1); r->height = clamp_t(s32, r->height, 1, dev->fb_cap.fmt.height - r->top); r->left = clamp_t(u32, r->left, 0, dev->fb_cap.fmt.width - 1); r->width = clamp_t(u32, r->width, 1, dev->fb_cap.fmt.width - r->left); } /* * Yeah, so sue me, it's an O(n^2) algorithm. But n is a small * number and it's typically a one-time deal. */ for (i = 0; i < win->clipcount - 1; i++) { struct v4l2_rect *r1 = &dev->try_clips_cap[i].c; for (j = i + 1; j < win->clipcount; j++) { struct v4l2_rect *r2 = &dev->try_clips_cap[j].c; if (v4l2_rect_overlap(r1, r2)) return -EINVAL; } } if (copy_to_user(win->clips, dev->try_clips_cap, win->clipcount * sizeof(dev->clips_cap[0]))) return -EFAULT; } return 0; } int vidioc_s_fmt_vid_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_cap; struct v4l2_window *win = &f->fmt.win; int ret = vidioc_try_fmt_vid_overlay(file, priv, f); unsigned bitmap_size = ((compose->width + 7) / 8) * compose->height; unsigned clips_size = win->clipcount * sizeof(dev->clips_cap[0]); void *new_bitmap = NULL; if (ret) return ret; if (win->bitmap) { new_bitmap = vzalloc(bitmap_size); if (new_bitmap == NULL) return -ENOMEM; if (copy_from_user(new_bitmap, win->bitmap, bitmap_size)) { vfree(new_bitmap); return -EFAULT; } } dev->overlay_cap_top = win->w.top; dev->overlay_cap_left = win->w.left; dev->overlay_cap_field = win->field; vfree(dev->bitmap_cap); dev->bitmap_cap = new_bitmap; dev->clipcount_cap = win->clipcount; if (dev->clipcount_cap) memcpy(dev->clips_cap, dev->try_clips_cap, clips_size); return 0; } int vivid_vid_cap_overlay(struct file *file, void *fh, unsigned i) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; if (i && dev->fb_vbase_cap == NULL) return -EINVAL; if (i && dev->fb_cap.fmt.pixelformat != dev->fmt_cap->fourcc) { dprintk(dev, 1, "mismatch between overlay and video capture pixelformats\n"); return -EINVAL; } if (dev->overlay_cap_owner && dev->overlay_cap_owner != fh) return -EBUSY; dev->overlay_cap_owner = i ? fh : NULL; return 0; } int vivid_vid_cap_g_fbuf(struct file *file, void *fh, struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; *a = dev->fb_cap; a->capability = V4L2_FBUF_CAP_BITMAP_CLIPPING | V4L2_FBUF_CAP_LIST_CLIPPING; a->flags = V4L2_FBUF_FLAG_PRIMARY; a->fmt.field = V4L2_FIELD_NONE; a->fmt.colorspace = V4L2_COLORSPACE_SRGB; a->fmt.priv = 0; return 0; } int vivid_vid_cap_s_fbuf(struct file *file, void *fh, const struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; if (dev->multiplanar) return -ENOTTY; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RAWIO)) return -EPERM; if (dev->overlay_cap_owner) return -EBUSY; if (a->base == NULL) { dev->fb_cap.base = NULL; dev->fb_vbase_cap = NULL; return 0; } if (a->fmt.width < 48 || a->fmt.height < 32) return -EINVAL; fmt = vivid_get_format(dev, a->fmt.pixelformat); if (!fmt || !fmt->can_do_overlay) return -EINVAL; if (a->fmt.bytesperline < (a->fmt.width * fmt->bit_depth[0]) / 8) return -EINVAL; if (a->fmt.bytesperline > a->fmt.sizeimage / a->fmt.height) return -EINVAL; /* * Only support the framebuffer of one of the vivid instances. * Anything else is rejected. */ if (!vivid_validate_fb(a)) return -EINVAL; dev->fb_vbase_cap = phys_to_virt((unsigned long)a->base); dev->fb_cap = *a; dev->overlay_cap_left = clamp_t(int, dev->overlay_cap_left, -dev->fb_cap.fmt.width, dev->fb_cap.fmt.width); dev->overlay_cap_top = clamp_t(int, dev->overlay_cap_top, -dev->fb_cap.fmt.height, dev->fb_cap.fmt.height); return 0; } static const struct v4l2_audio vivid_audio_inputs[] = { { 0, "TV", V4L2_AUDCAP_STEREO }, { 1, "Line-In", V4L2_AUDCAP_STEREO }, }; int vidioc_enum_input(struct file *file, void *priv, struct v4l2_input *inp) { struct vivid_dev *dev = video_drvdata(file); if (inp->index >= dev->num_inputs) return -EINVAL; inp->type = V4L2_INPUT_TYPE_CAMERA; switch (dev->input_type[inp->index]) { case WEBCAM: snprintf(inp->name, sizeof(inp->name), "Webcam %u", dev->input_name_counter[inp->index]); inp->capabilities = 0; break; case TV: snprintf(inp->name, sizeof(inp->name), "TV %u", dev->input_name_counter[inp->index]); inp->type = V4L2_INPUT_TYPE_TUNER; inp->std = V4L2_STD_ALL; if (dev->has_audio_inputs) inp->audioset = (1 << ARRAY_SIZE(vivid_audio_inputs)) - 1; inp->capabilities = V4L2_IN_CAP_STD; break; case SVID: snprintf(inp->name, sizeof(inp->name), "S-Video %u", dev->input_name_counter[inp->index]); inp->std = V4L2_STD_ALL; if (dev->has_audio_inputs) inp->audioset = (1 << ARRAY_SIZE(vivid_audio_inputs)) - 1; inp->capabilities = V4L2_IN_CAP_STD; break; case HDMI: snprintf(inp->name, sizeof(inp->name), "HDMI %u", dev->input_name_counter[inp->index]); inp->capabilities = V4L2_IN_CAP_DV_TIMINGS; if (dev->edid_blocks == 0 || dev->dv_timings_signal_mode == NO_SIGNAL) inp->status |= V4L2_IN_ST_NO_SIGNAL; else if (dev->dv_timings_signal_mode == NO_LOCK || dev->dv_timings_signal_mode == OUT_OF_RANGE) inp->status |= V4L2_IN_ST_NO_H_LOCK; break; } if (dev->sensor_hflip) inp->status |= V4L2_IN_ST_HFLIP; if (dev->sensor_vflip) inp->status |= V4L2_IN_ST_VFLIP; if (dev->input == inp->index && vivid_is_sdtv_cap(dev)) { if (dev->std_signal_mode == NO_SIGNAL) { inp->status |= V4L2_IN_ST_NO_SIGNAL; } else if (dev->std_signal_mode == NO_LOCK) { inp->status |= V4L2_IN_ST_NO_H_LOCK; } else if (vivid_is_tv_cap(dev)) { switch (tpg_g_quality(&dev->tpg)) { case TPG_QUAL_GRAY: inp->status |= V4L2_IN_ST_COLOR_KILL; break; case TPG_QUAL_NOISE: inp->status |= V4L2_IN_ST_NO_H_LOCK; break; default: break; } } } return 0; } int vidioc_g_input(struct file *file, void *priv, unsigned *i) { struct vivid_dev *dev = video_drvdata(file); *i = dev->input; return 0; } int vidioc_s_input(struct file *file, void *priv, unsigned i) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_bt_timings *bt = &dev->dv_timings_cap.bt; unsigned brightness; if (i >= dev->num_inputs) return -EINVAL; if (i == dev->input) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q) || vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; dev->input = i; dev->vid_cap_dev.tvnorms = 0; if (dev->input_type[i] == TV || dev->input_type[i] == SVID) { dev->tv_audio_input = (dev->input_type[i] == TV) ? 0 : 1; dev->vid_cap_dev.tvnorms = V4L2_STD_ALL; } dev->vbi_cap_dev.tvnorms = dev->vid_cap_dev.tvnorms; vivid_update_format_cap(dev, false); if (dev->colorspace) { switch (dev->input_type[i]) { case WEBCAM: v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); break; case TV: case SVID: v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); break; case HDMI: if (bt->flags & V4L2_DV_FL_IS_CE_VIDEO) { if (dev->src_rect.width == 720 && dev->src_rect.height <= 576) v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_170M); else v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_709); } else { v4l2_ctrl_s_ctrl(dev->colorspace, VIVID_CS_SRGB); } break; } } /* * Modify the brightness range depending on the input. * This makes it easy to use vivid to test if applications can * handle control range modifications and is also how this is * typically used in practice as different inputs may be hooked * up to different receivers with different control ranges. */ brightness = 128 * i + dev->input_brightness[i]; v4l2_ctrl_modify_range(dev->brightness, 128 * i, 255 + 128 * i, 1, 128 + 128 * i); v4l2_ctrl_s_ctrl(dev->brightness, brightness); return 0; } int vidioc_enumaudio(struct file *file, void *fh, struct v4l2_audio *vin) { if (vin->index >= ARRAY_SIZE(vivid_audio_inputs)) return -EINVAL; *vin = vivid_audio_inputs[vin->index]; return 0; } int vidioc_g_audio(struct file *file, void *fh, struct v4l2_audio *vin) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; *vin = vivid_audio_inputs[dev->tv_audio_input]; return 0; } int vidioc_s_audio(struct file *file, void *fh, const struct v4l2_audio *vin) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; if (vin->index >= ARRAY_SIZE(vivid_audio_inputs)) return -EINVAL; dev->tv_audio_input = vin->index; return 0; } int vivid_video_g_frequency(struct file *file, void *fh, struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); if (vf->tuner != 0) return -EINVAL; vf->frequency = dev->tv_freq; return 0; } int vivid_video_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf) { struct vivid_dev *dev = video_drvdata(file); if (vf->tuner != 0) return -EINVAL; dev->tv_freq = clamp_t(unsigned, vf->frequency, MIN_TV_FREQ, MAX_TV_FREQ); if (vivid_is_tv_cap(dev)) vivid_update_quality(dev); return 0; } int vivid_video_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); if (vt->index != 0) return -EINVAL; if (vt->audmode > V4L2_TUNER_MODE_LANG1_LANG2) return -EINVAL; dev->tv_audmode = vt->audmode; return 0; } int vivid_video_g_tuner(struct file *file, void *fh, struct v4l2_tuner *vt) { struct vivid_dev *dev = video_drvdata(file); enum tpg_quality qual; if (vt->index != 0) return -EINVAL; vt->capability = V4L2_TUNER_CAP_NORM | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_LANG1 | V4L2_TUNER_CAP_LANG2; vt->audmode = dev->tv_audmode; vt->rangelow = MIN_TV_FREQ; vt->rangehigh = MAX_TV_FREQ; qual = vivid_get_quality(dev, &vt->afc); if (qual == TPG_QUAL_COLOR) vt->signal = 0xffff; else if (qual == TPG_QUAL_GRAY) vt->signal = 0x8000; else vt->signal = 0; if (qual == TPG_QUAL_NOISE) { vt->rxsubchans = 0; } else if (qual == TPG_QUAL_GRAY) { vt->rxsubchans = V4L2_TUNER_SUB_MONO; } else { unsigned channel_nr = dev->tv_freq / (6 * 16); unsigned options = (dev->std_cap & V4L2_STD_NTSC_M) ? 4 : 3; switch (channel_nr % options) { case 0: vt->rxsubchans = V4L2_TUNER_SUB_MONO; break; case 1: vt->rxsubchans = V4L2_TUNER_SUB_STEREO; break; case 2: if (dev->std_cap & V4L2_STD_NTSC_M) vt->rxsubchans = V4L2_TUNER_SUB_MONO | V4L2_TUNER_SUB_SAP; else vt->rxsubchans = V4L2_TUNER_SUB_LANG1 | V4L2_TUNER_SUB_LANG2; break; case 3: vt->rxsubchans = V4L2_TUNER_SUB_STEREO | V4L2_TUNER_SUB_SAP; break; } } strlcpy(vt->name, "TV Tuner", sizeof(vt->name)); return 0; } /* Must remain in sync with the vivid_ctrl_standard_strings array */ const v4l2_std_id vivid_standard[] = { V4L2_STD_NTSC_M, V4L2_STD_NTSC_M_JP, V4L2_STD_NTSC_M_KR, V4L2_STD_NTSC_443, V4L2_STD_PAL_BG | V4L2_STD_PAL_H, V4L2_STD_PAL_I, V4L2_STD_PAL_DK, V4L2_STD_PAL_M, V4L2_STD_PAL_N, V4L2_STD_PAL_Nc, V4L2_STD_PAL_60, V4L2_STD_SECAM_B | V4L2_STD_SECAM_G | V4L2_STD_SECAM_H, V4L2_STD_SECAM_DK, V4L2_STD_SECAM_L, V4L2_STD_SECAM_LC, V4L2_STD_UNKNOWN }; /* Must remain in sync with the vivid_standard array */ const char * const vivid_ctrl_standard_strings[] = { "NTSC-M", "NTSC-M-JP", "NTSC-M-KR", "NTSC-443", "PAL-BGH", "PAL-I", "PAL-DK", "PAL-M", "PAL-N", "PAL-Nc", "PAL-60", "SECAM-BGH", "SECAM-DK", "SECAM-L", "SECAM-Lc", NULL, }; int vidioc_querystd(struct file *file, void *priv, v4l2_std_id *id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -ENODATA; if (dev->std_signal_mode == NO_SIGNAL || dev->std_signal_mode == NO_LOCK) { *id = V4L2_STD_UNKNOWN; return 0; } if (vivid_is_tv_cap(dev) && tpg_g_quality(&dev->tpg) == TPG_QUAL_NOISE) { *id = V4L2_STD_UNKNOWN; } else if (dev->std_signal_mode == CURRENT_STD) { *id = dev->std_cap; } else if (dev->std_signal_mode == SELECTED_STD) { *id = dev->query_std; } else { *id = vivid_standard[dev->query_std_last]; dev->query_std_last = (dev->query_std_last + 1) % ARRAY_SIZE(vivid_standard); } return 0; } int vivid_vid_cap_s_std(struct file *file, void *priv, v4l2_std_id id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_sdtv_cap(dev)) return -ENODATA; if (dev->std_cap == id) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q) || vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; dev->std_cap = id; vivid_update_format_cap(dev, false); return 0; } static void find_aspect_ratio(u32 width, u32 height, u32 *num, u32 *denom) { if (!(height % 3) && ((height * 4 / 3) == width)) { *num = 4; *denom = 3; } else if (!(height % 9) && ((height * 16 / 9) == width)) { *num = 16; *denom = 9; } else if (!(height % 10) && ((height * 16 / 10) == width)) { *num = 16; *denom = 10; } else if (!(height % 4) && ((height * 5 / 4) == width)) { *num = 5; *denom = 4; } else if (!(height % 9) && ((height * 15 / 9) == width)) { *num = 15; *denom = 9; } else { /* default to 16:9 */ *num = 16; *denom = 9; } } static bool valid_cvt_gtf_timings(struct v4l2_dv_timings *timings) { struct v4l2_bt_timings *bt = &timings->bt; u32 total_h_pixel; u32 total_v_lines; u32 h_freq; if (!v4l2_valid_dv_timings(timings, &vivid_dv_timings_cap, NULL, NULL)) return false; total_h_pixel = V4L2_DV_BT_FRAME_WIDTH(bt); total_v_lines = V4L2_DV_BT_FRAME_HEIGHT(bt); h_freq = (u32)bt->pixelclock / total_h_pixel; if (bt->standards == 0 || (bt->standards & V4L2_DV_BT_STD_CVT)) { if (v4l2_detect_cvt(total_v_lines, h_freq, bt->vsync, bt->width, bt->polarities, bt->interlaced, timings)) return true; } if (bt->standards == 0 || (bt->standards & V4L2_DV_BT_STD_GTF)) { struct v4l2_fract aspect_ratio; find_aspect_ratio(bt->width, bt->height, &aspect_ratio.numerator, &aspect_ratio.denominator); if (v4l2_detect_gtf(total_v_lines, h_freq, bt->vsync, bt->polarities, bt->interlaced, aspect_ratio, timings)) return true; } return false; } int vivid_vid_cap_s_dv_timings(struct file *file, void *_fh, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_cap(dev)) return -ENODATA; if (!v4l2_find_dv_timings_cap(timings, &vivid_dv_timings_cap, 0, NULL, NULL) && !valid_cvt_gtf_timings(timings)) return -EINVAL; if (v4l2_match_dv_timings(timings, &dev->dv_timings_cap, 0, false)) return 0; if (vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->dv_timings_cap = *timings; vivid_update_format_cap(dev, false); return 0; } int vidioc_query_dv_timings(struct file *file, void *_fh, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_cap(dev)) return -ENODATA; if (dev->dv_timings_signal_mode == NO_SIGNAL || dev->edid_blocks == 0) return -ENOLINK; if (dev->dv_timings_signal_mode == NO_LOCK) return -ENOLCK; if (dev->dv_timings_signal_mode == OUT_OF_RANGE) { timings->bt.pixelclock = vivid_dv_timings_cap.bt.max_pixelclock * 2; return -ERANGE; } if (dev->dv_timings_signal_mode == CURRENT_DV_TIMINGS) { *timings = dev->dv_timings_cap; } else if (dev->dv_timings_signal_mode == SELECTED_DV_TIMINGS) { *timings = v4l2_dv_timings_presets[dev->query_dv_timings]; } else { *timings = v4l2_dv_timings_presets[dev->query_dv_timings_last]; dev->query_dv_timings_last = (dev->query_dv_timings_last + 1) % dev->query_dv_timings_size; } return 0; } int vidioc_s_edid(struct file *file, void *_fh, struct v4l2_edid *edid) { struct vivid_dev *dev = video_drvdata(file); u16 phys_addr; unsigned int i; int ret; memset(edid->reserved, 0, sizeof(edid->reserved)); if (edid->pad >= dev->num_inputs) return -EINVAL; if (dev->input_type[edid->pad] != HDMI || edid->start_block) return -EINVAL; if (edid->blocks == 0) { dev->edid_blocks = 0; phys_addr = CEC_PHYS_ADDR_INVALID; goto set_phys_addr; } if (edid->blocks > dev->edid_max_blocks) { edid->blocks = dev->edid_max_blocks; return -E2BIG; } phys_addr = cec_get_edid_phys_addr(edid->edid, edid->blocks * 128, NULL); ret = v4l2_phys_addr_validate(phys_addr, &phys_addr, NULL); if (ret) return ret; if (vb2_is_busy(&dev->vb_vid_cap_q)) return -EBUSY; dev->edid_blocks = edid->blocks; memcpy(dev->edid, edid->edid, edid->blocks * 128); set_phys_addr: /* TODO: a proper hotplug detect cycle should be emulated here */ cec_s_phys_addr(dev->cec_rx_adap, phys_addr, false); for (i = 0; i < MAX_OUTPUTS && dev->cec_tx_adap[i]; i++) cec_s_phys_addr(dev->cec_tx_adap[i], v4l2_phys_addr_for_input(phys_addr, i + 1), false); return 0; } int vidioc_enum_framesizes(struct file *file, void *fh, struct v4l2_frmsizeenum *fsize) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_webcam(dev) && !dev->has_scaler_cap) return -EINVAL; if (vivid_get_format(dev, fsize->pixel_format) == NULL) return -EINVAL; if (vivid_is_webcam(dev)) { if (fsize->index >= ARRAY_SIZE(webcam_sizes)) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_DISCRETE; fsize->discrete = webcam_sizes[fsize->index]; return 0; } if (fsize->index) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE; fsize->stepwise.min_width = MIN_WIDTH; fsize->stepwise.max_width = MAX_WIDTH * MAX_ZOOM; fsize->stepwise.step_width = 2; fsize->stepwise.min_height = MIN_HEIGHT; fsize->stepwise.max_height = MAX_HEIGHT * MAX_ZOOM; fsize->stepwise.step_height = 2; return 0; } /* timeperframe is arbitrary and continuous */ int vidioc_enum_frameintervals(struct file *file, void *priv, struct v4l2_frmivalenum *fival) { struct vivid_dev *dev = video_drvdata(file); const struct vivid_fmt *fmt; int i; fmt = vivid_get_format(dev, fival->pixel_format); if (!fmt) return -EINVAL; if (!vivid_is_webcam(dev)) { if (fival->index) return -EINVAL; if (fival->width < MIN_WIDTH || fival->width > MAX_WIDTH * MAX_ZOOM) return -EINVAL; if (fival->height < MIN_HEIGHT || fival->height > MAX_HEIGHT * MAX_ZOOM) return -EINVAL; fival->type = V4L2_FRMIVAL_TYPE_DISCRETE; fival->discrete = dev->timeperframe_vid_cap; return 0; } for (i = 0; i < ARRAY_SIZE(webcam_sizes); i++) if (fival->width == webcam_sizes[i].width && fival->height == webcam_sizes[i].height) break; if (i == ARRAY_SIZE(webcam_sizes)) return -EINVAL; if (fival->index >= 2 * (VIVID_WEBCAM_SIZES - i)) return -EINVAL; fival->type = V4L2_FRMIVAL_TYPE_DISCRETE; fival->discrete = webcam_intervals[fival->index]; return 0; } int vivid_vid_cap_g_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE)) return -EINVAL; parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.capture.timeperframe = dev->timeperframe_vid_cap; parm->parm.capture.readbuffers = 1; return 0; } #define FRACT_CMP(a, OP, b) \ ((u64)(a).numerator * (b).denominator OP (u64)(b).numerator * (a).denominator) int vivid_vid_cap_s_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); unsigned ival_sz = 2 * (VIVID_WEBCAM_SIZES - dev->webcam_size_idx); struct v4l2_fract tpf; unsigned i; if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE : V4L2_BUF_TYPE_VIDEO_CAPTURE)) return -EINVAL; if (!vivid_is_webcam(dev)) return vivid_vid_cap_g_parm(file, priv, parm); tpf = parm->parm.capture.timeperframe; if (tpf.denominator == 0) tpf = webcam_intervals[ival_sz - 1]; for (i = 0; i < ival_sz; i++) if (FRACT_CMP(tpf, >=, webcam_intervals[i])) break; if (i == ival_sz) i = ival_sz - 1; dev->webcam_ival_idx = i; tpf = webcam_intervals[dev->webcam_ival_idx]; tpf = FRACT_CMP(tpf, <, tpf_min) ? tpf_min : tpf; tpf = FRACT_CMP(tpf, >, tpf_max) ? tpf_max : tpf; /* resync the thread's timings */ dev->cap_seq_resync = true; dev->timeperframe_vid_cap = tpf; parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.capture.timeperframe = tpf; parm->parm.capture.readbuffers = 1; return 0; } |
4394 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CLEANCACHE_H #define _LINUX_CLEANCACHE_H #include <linux/fs.h> #include <linux/exportfs.h> #include <linux/mm.h> #define CLEANCACHE_NO_POOL -1 #define CLEANCACHE_NO_BACKEND -2 #define CLEANCACHE_NO_BACKEND_SHARED -3 #define CLEANCACHE_KEY_MAX 6 /* * cleancache requires every file with a page in cleancache to have a * unique key unless/until the file is removed/truncated. For some * filesystems, the inode number is unique, but for "modern" filesystems * an exportable filehandle is required (see exportfs.h) */ struct cleancache_filekey { union { ino_t ino; __u32 fh[CLEANCACHE_KEY_MAX]; u32 key[CLEANCACHE_KEY_MAX]; } u; }; struct cleancache_ops { int (*init_fs)(size_t); int (*init_shared_fs)(uuid_t *uuid, size_t); int (*get_page)(int, struct cleancache_filekey, pgoff_t, struct page *); void (*put_page)(int, struct cleancache_filekey, pgoff_t, struct page *); void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); void (*invalidate_inode)(int, struct cleancache_filekey); void (*invalidate_fs)(int); }; extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); extern void __cleancache_invalidate_inode(struct address_space *); extern void __cleancache_invalidate_fs(struct super_block *); #ifdef CONFIG_CLEANCACHE #define cleancache_enabled (1) static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) { return mapping->host->i_sb->cleancache_poolid >= 0; } static inline bool cleancache_fs_enabled(struct page *page) { return cleancache_fs_enabled_mapping(page->mapping); } #else #define cleancache_enabled (0) #define cleancache_fs_enabled(_page) (0) #define cleancache_fs_enabled_mapping(_page) (0) #endif /* * The shim layer provided by these inline functions allows the compiler * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE * is disabled, to a single global variable check if CONFIG_CLEANCACHE * is enabled but no cleancache "backend" has dynamically enabled it, * and, for the most frequent cleancache ops, to a single global variable * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled * and a cleancache backend has dynamically enabled cleancache, but the * filesystem referenced by that cleancache op has not enabled cleancache. * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially * no measurable performance impact. */ static inline void cleancache_init_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_init_fs(sb); } static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) { if (cleancache_enabled && cleancache_fs_enabled(page)) return __cleancache_get_page(page); return -1; } static inline void cleancache_put_page(struct page *page) { if (cleancache_enabled && cleancache_fs_enabled(page)) __cleancache_put_page(page); } static inline void cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) __cleancache_invalidate_page(mapping, page); } static inline void cleancache_invalidate_inode(struct address_space *mapping) { if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) __cleancache_invalidate_inode(mapping); } static inline void cleancache_invalidate_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_invalidate_fs(sb); } #endif /* _LINUX_CLEANCACHE_H */ |
19 18 18 19 36 34 34 33 33 32 32 32 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2, * or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * xattr_id.c */ /* * This file implements code to map the 32-bit xattr id stored in the inode * into the on disk location of the xattr data. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "xattr.h" /* * Map xattr id using the xattr id look up table */ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, int *count, unsigned int *size, unsigned long long *xattr) { struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_XATTR_BLOCK(index); int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); u64 start_block; struct squashfs_xattr_id id; int err; if (index >= msblk->xattr_ids) return -EINVAL; start_block = le64_to_cpu(msblk->xattr_id_table[block]); err = squashfs_read_metadata(sb, &id, &start_block, &offset, sizeof(id)); if (err < 0) return err; *xattr = le64_to_cpu(id.xattr); *size = le32_to_cpu(id.size); *count = le32_to_cpu(id.count); return 0; } /* * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, u64 *xattr_table_start, int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; struct squashfs_xattr_id_table *id_table; __le64 *table; u64 start, end; int n; id_table = squashfs_read_table(sb, table_start, sizeof(*id_table)); if (IS_ERR(id_table)) return (__le64 *) id_table; *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); *xattr_ids = le32_to_cpu(id_table->xattr_ids); kfree(id_table); /* Sanity check values */ /* there is always at least one xattr id */ if (*xattr_ids == 0) return ERR_PTR(-EINVAL); len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids); /* * The computed size of the index table (len bytes) should exactly * match the table start and end points */ start = table_start + sizeof(*id_table); end = msblk->bytes_used; if (len != (end - start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, start, len); if (IS_ERR(table)) return table; /* table[0], table[1], ... table[indexes - 1] store the locations * of the compressed xattr id blocks. Each entry should be less than * the next (i.e. table[0] < table[1]), and the difference between them * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] * should be less than table_start, and again the difference * shouls be SQUASHFS_METADATA_SIZE or less. * * Finally xattr_table_start should be less than table[0]. */ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= table_start || (table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } if (*xattr_table_start >= le64_to_cpu(table[0])) { kfree(table); return ERR_PTR(-EINVAL); } return table; } |
21 21 21 5 2 9 9 9 9 9 6 4 6 6 5 5 6 3 3 4 3 3 5 5 5 5 3 2 1 1 2 2 5 6 2 4 3 2 2 5 5 5 5 5 5 5 4 4 3 1 51 2 2 2 49 2 49 2 49 1 49 3 49 1 49 2 49 3 1 49 1 48 49 2 49 1 51 2 51 6 6 2 1 2 2 6 1 7 7 7 5 4 4 5 5 7 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 | /* * linux/fs/9p/vfs_inode_dotl.c * * This file contains vfs inode ops for the 9P2000.L protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/inet.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" #include "xattr.h" #include "acl.h" static int v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) { BUG_ON(dir_inode == NULL); if (dir_inode->i_mode & S_ISGID) { /* set_gid bit is set.*/ return dir_inode->i_gid; } return current_fsgid(); } static int v9fs_test_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) return 0; if (inode->i_generation != st->st_gen) return 0; /* compare qid details */ if (memcmp(&v9inode->qid.version, &st->qid.version, sizeof(v9inode->qid.version))) return 0; if (v9inode->qid.type != st->qid.type) return 0; if (v9inode->qid.path != st->qid.path) return 0; return 1; } /* Always get a new inode */ static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) { return 0; } static int v9fs_set_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); inode->i_generation = st->st_gen; return 0; } static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, struct p9_stat_dotl *st, int new) { int retval; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; int (*test)(struct inode *, void *); if (new) test = v9fs_test_new_inode_dotl; else test = v9fs_test_inode_dotl; i_ino = v9fs_qid2ino(qid); inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later. */ inode->i_ino = i_ino; retval = v9fs_init_inode(v9ses, inode, st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) goto error; unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(retval); } struct inode * v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new) { struct p9_stat_dotl *st; struct inode *inode = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); if (IS_ERR(st)) return ERR_CAST(st); inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); kfree(st); return inode; } struct dotl_openflag_map { int open_flag; int dotl_flag; }; static int v9fs_mapped_dotl_flags(int flags) { int i; int rflags = 0; struct dotl_openflag_map dotl_oflag_map[] = { { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, { FASYNC, P9_DOTL_FASYNC }, { O_DIRECT, P9_DOTL_DIRECT }, { O_LARGEFILE, P9_DOTL_LARGEFILE }, { O_DIRECTORY, P9_DOTL_DIRECTORY }, { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, { O_NOATIME, P9_DOTL_NOATIME }, { O_CLOEXEC, P9_DOTL_CLOEXEC }, { O_SYNC, P9_DOTL_SYNC}, }; for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { if (flags & dotl_oflag_map[i].open_flag) rflags |= dotl_oflag_map[i].dotl_flag; } return rflags; } /** * v9fs_open_to_dotl_flags- convert Linux specific open flags to * plan 9 open flag. * @flags: flags to convert */ int v9fs_open_to_dotl_flags(int flags) { int rflags = 0; /* * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY * and P9_DOTL_NOACCESS */ rflags |= flags & O_ACCMODE; rflags |= v9fs_mapped_dotl_flags(flags); return rflags; } /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions * */ static int v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); } static int v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t omode) { int err = 0; kgid_t gid; umode_t mode; const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; struct v9fs_inode *v9inode; struct p9_fid *dfid, *ofid, *inode_fid; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) dentry = res; } /* Only creates */ if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", name, flags, omode); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto out; } /* clone a fid to use for creation */ ofid = clone_fid(dfid); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), mode, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); goto error; } v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } /* Now set the ACL based on the default value */ v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid * we do it during open time instead of * page dirty time via write_begin/page_mkwrite * because we want write after unlink usecase * to work. */ inode_fid = v9fs_writeback_fid(dentry); if (IS_ERR(inode_fid)) { err = PTR_ERR(inode_fid); mutex_unlock(&v9inode->v_mutex); goto err_clunk_old_fid; } v9inode->writeback_fid = (void *) inode_fid; } mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) goto err_clunk_old_fid; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); file->f_mode |= FMODE_CREATED; out: v9fs_put_acl(dacl, pacl); dput(res); return err; error: if (fid) p9_client_clunk(fid); err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); goto out; } /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dentry, umode_t omode) { int err; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; umode_t mode; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); err = 0; v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) omode |= S_ISGID; dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); if (err < 0) goto error; fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_fid_add(dentry, fid); v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); fid = NULL; err = 0; } else { /* * Not in cached mode. No need to populate * inode with stat. We need to get an inode * so that we can set the acl with dentry */ inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); v9fs_put_acl(dacl, pacl); return err; } static int v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); /* Ask for all the fields in stat structure. Server will return * whatever it supports */ st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; kfree(st); return 0; } /* * Attribute flags. */ #define P9_ATTR_MODE (1 << 0) #define P9_ATTR_UID (1 << 1) #define P9_ATTR_GID (1 << 2) #define P9_ATTR_SIZE (1 << 3) #define P9_ATTR_ATIME (1 << 4) #define P9_ATTR_MTIME (1 << 5) #define P9_ATTR_CTIME (1 << 6) #define P9_ATTR_ATIME_SET (1 << 7) #define P9_ATTR_MTIME_SET (1 << 8) struct dotl_iattr_map { int iattr_valid; int p9_iattr_valid; }; static int v9fs_mapped_iattr_valid(int iattr_valid) { int i; int p9_iattr_valid = 0; struct dotl_iattr_map dotl_iattr_map[] = { { ATTR_MODE, P9_ATTR_MODE }, { ATTR_UID, P9_ATTR_UID }, { ATTR_GID, P9_ATTR_GID }, { ATTR_SIZE, P9_ATTR_SIZE }, { ATTR_ATIME, P9_ATTR_ATIME }, { ATTR_MTIME, P9_ATTR_MTIME }, { ATTR_CTIME, P9_ATTR_CTIME }, { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, }; for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { if (iattr_valid & dotl_iattr_map[i].iattr_valid) p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; } return p9_iattr_valid; } /** * v9fs_vfs_setattr_dotl - set file metadata * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; struct p9_fid *fid; struct p9_iattr_dotl p9attr; struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); retval = setattr_prepare(dentry, iattr); if (retval) return retval; p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); p9attr.mode = iattr->ia_mode; p9attr.uid = iattr->ia_uid; p9attr.gid = iattr->ia_gid; p9attr.size = iattr->ia_size; p9attr.atime_sec = iattr->ia_atime.tv_sec; p9attr.atime_nsec = iattr->ia_atime.tv_nsec; p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) filemap_write_and_wait(inode->i_mapping); retval = p9_client_setattr(fid, &p9attr); if (retval < 0) return retval; if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); setattr_copy(inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ retval = v9fs_acl_chmod(inode, fid); if (retval < 0) return retval; } return 0; } /** * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { inode->i_atime.tv_sec = stat->st_atime_sec; inode->i_atime.tv_nsec = stat->st_atime_nsec; inode->i_mtime.tv_sec = stat->st_mtime_sec; inode->i_mtime.tv_nsec = stat->st_mtime_nsec; inode->i_ctime.tv_sec = stat->st_ctime_sec; inode->i_ctime.tv_nsec = stat->st_ctime_nsec; inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode->i_atime.tv_sec = stat->st_atime_sec; inode->i_atime.tv_nsec = stat->st_atime_nsec; } if (stat->st_result_mask & P9_STATS_MTIME) { inode->i_mtime.tv_sec = stat->st_mtime_sec; inode->i_mtime.tv_nsec = stat->st_mtime_nsec; } if (stat->st_result_mask & P9_STATS_CTIME) { inode->i_ctime.tv_sec = stat->st_ctime_sec; inode->i_ctime.tv_nsec = stat->st_ctime_nsec; } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. */ v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } static int v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, const char *symname) { int err; kgid_t gid; const unsigned char *name; struct p9_qid qid; struct inode *inode; struct p9_fid *dfid; struct p9_fid *fid = NULL; struct v9fs_session_info *v9ses; name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } gid = v9fs_get_fsgid_for_create(dir); /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ err = p9_client_symlink(dfid, name, symname, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } v9fs_invalidate_inode_attr(dir); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } /* instantiate inode and assign the unopened fid to dentry */ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); fid = NULL; err = 0; } else { /* Not in cached mode. No need to populate inode with stat */ inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } d_instantiate(dentry, inode); } error: if (fid) p9_client_clunk(fid); return err; } /** * v9fs_vfs_link_dotl - create a hardlink for dotl * @old_dentry: dentry for file to link to * @dir: inode destination for new link * @dentry: dentry for link * */ static int v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) return PTR_ERR(oldfid); err = p9_client_link(dfid, oldfid, dentry->d_name.name); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } v9fs_invalidate_inode_attr(dir); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Get the latest stat info from server. */ struct p9_fid *fid; fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); } ihold(d_inode(old_dentry)); d_instantiate(dentry, d_inode(old_dentry)); return err; } /** * v9fs_vfs_mknod_dotl - create a special file * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation * @rdev: device associated with special file * */ static int v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; kgid_t gid; const unsigned char *name; umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); dfid = NULL; goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); if (err < 0) goto error; v9fs_invalidate_inode_attr(dir); fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); fid = NULL; goto error; } /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, fid); d_instantiate(dentry, inode); fid = NULL; err = 0; } else { /* * Not in cached mode. No need to populate inode with stat. * socket syscall returns a fd, so we need instantiate */ inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } error: if (fid) p9_client_clunk(fid); v9fs_put_acl(dacl, pacl); return err; } /** * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink * @done: destructor for return value */ static const char * v9fs_vfs_get_link_dotl(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct p9_fid *fid; char *target; int retval; if (!dentry) return ERR_PTR(-ECHILD); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); if (retval) return ERR_PTR(retval); set_delayed_call(done, kfree_link, target); return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); /* * Don't update inode if the file type is different */ if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; /* * We don't want to refresh inode->i_size, * because we may have cached data */ flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; } const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, .atomic_open = v9fs_vfs_atomic_open_dotl, .lookup = v9fs_vfs_lookup, .link = v9fs_vfs_link_dotl, .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_acl = v9fs_iop_get_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, }; |
242 242 242 242 242 242 242 242 905 428 428 15 10 10 5 5 5 5 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | /* * Pid namespaces * * Authors: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. * @level: pid namespace level */ static struct kmem_cache *create_pid_cachep(unsigned int level) { /* Level 0 is init_pid_ns.pid_cachep */ struct kmem_cache **pkc = &pid_cache[level - 1]; struct kmem_cache *kc; char name[4 + 10 + 1]; unsigned int len; kc = READ_ONCE(*pkc); if (kc) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); len = sizeof(struct pid) + level * sizeof(struct upid); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); } static void proc_cleanup_work(struct work_struct *work) { struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); pid_ns_release_proc(ns); } static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); } static void dec_pid_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; int err; err = -EINVAL; if (!in_userns(parent_pid_ns->user_ns, user_ns)) goto out; err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); if (!ucounts) goto out; err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out_dec; idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } static void delayed_free_pidns(struct rcu_head *p) { struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) { ns_free_inum(&ns->ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) { struct pid_namespace *ns; ns = container_of(kref, struct pid_namespace, kref); destroy_pid_namespace(ns); } void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; if (!kref_put(&ns->kref, free_pid_ns)) break; ns = parent; } } EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); /* * Ignore SIGCHLD causing any terminated children to autoreap. * This speeds up the namespace shutdown, plus see the comment * below. */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. * Find remaining pid_ts in the namespace, signal and wait for them * to exit. * * Note: This signals each threads in the namespace - even those that * belong to the same thread group, To avoid this, we would have * to walk the entire tasklist looking a processes in this * namespace, but that could be unnecessarily expensive if the * pid namespace has just a few processes. Or we need to * maintain a tasklist for each pid namespace. * */ rcu_read_lock(); read_lock(&tasklist_lock); nr = 2; idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); } read_unlock(&tasklist_lock); rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* * kernel_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). * Again, ignoring the user visible semantics we do not really need * to wait until they are all reaped, but they can be reparented to * us and thus we need to ensure that pid->child_reaper stays valid * until they all go away. See free_pid()->wake_up_process(). * * We rely on ignored SIGCHLD, an injected zombie must be autoreaped * if reparented. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; schedule(); } __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; acct_exit_ns(pid_ns); return; } #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* * Writing directly to ns' last_pid field is OK, since this field * is volatile in a living namespace anyway and a code writing to * it should synchronize its usage with external means. */ next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); return ret; } extern int pid_max; static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = &zero, .extra2 = &pid_max, }, { } }; static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { if (pid_ns == &init_pid_ns) return 0; switch (cmd) { case LINUX_REBOOT_CMD_RESTART2: case LINUX_REBOOT_CMD_RESTART: pid_ns->reboot = SIGHUP; break; case LINUX_REBOOT_CMD_POWER_OFF: case LINUX_REBOOT_CMD_HALT: pid_ns->reboot = SIGINT; break; default: return -EINVAL; } read_lock(&tasklist_lock); send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); /* Not reached */ return 0; } static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(task); if (ns) get_pid_ns(ns); rcu_read_unlock(); return ns ? &ns->ns : NULL; } static struct ns_common *pidns_for_children_get(struct task_struct *task) { struct pid_namespace *ns = NULL; task_lock(task); if (task->nsproxy) { ns = task->nsproxy->pid_ns_for_children; get_pid_ns(ns); } task_unlock(task); if (ns) { read_lock(&tasklist_lock); if (!ns->child_reaper) { put_pid_ns(ns); ns = NULL; } read_unlock(&tasklist_lock); } return ns ? &ns->ns : NULL; } static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); } static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* * Only allow entering the current active pid namespace * or a child of the current active pid namespace. * * This is required for fork to return a usable pid value and * this maintains the property that processes and their * children can not escape their current pid namespace. */ if (new->level < active->level) return -EINVAL; ancestor = new; while (ancestor->level > active->level) ancestor = ancestor->parent; if (ancestor != active) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } static struct ns_common *pidns_get_parent(struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *pid_ns, *p; /* See if the parent is in the current namespace */ pid_ns = p = to_pid_ns(ns)->parent; for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == active) break; p = p->parent; } return &get_pid_ns(pid_ns)->ns; } static struct user_namespace *pidns_owner(struct ns_common *ns) { return to_pid_ns(ns)->user_ns; } const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); #endif return 0; } __initcall(pid_namespaces_init); |
8 8 8 8 8 8 8 7 8 8 8 8 8 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <drm/drmP.h> #include <drm/drm_connector.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_utils.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: overview * * In DRM connectors are the general abstraction for display sinks, and include * als fixed panels or anything else that can display pixels in some form. As * opposed to all other KMS objects representing hardware (like CRTC, encoder or * plane abstractions) connectors can be hotplugged and unplugged at runtime. * Hence they are reference-counted using drm_connector_get() and * drm_connector_put(). * * KMS driver must create, initialize, register and attach at a &struct * drm_connector for each such sink. The instance is created as other KMS * objects and initialized by setting the following fields. The connector is * initialized with a call to drm_connector_init() with a pointer to the * &struct drm_connector_funcs and a connector type, and then exposed to * userspace with a call to drm_connector_register(). * * Connectors must be attached to an encoder to be used. For devices that map * connectors to encoders 1:1, the connector should be attached at * initialization time with a call to drm_connector_attach_encoder(). The * driver must also set the &drm_connector.encoder field to point to the * attached encoder. * * For connectors which are not fixed (like built-in panels) the driver needs to * support hotplug notifications. The simplest way to do that is by using the * probe helpers, see drm_kms_helper_poll_init() for connectors which don't have * hardware support for hotplug interrupts. Connectors with hardware hotplug * support can instead use e.g. drm_helper_hpd_irq_event(). */ struct drm_conn_prop_enum_list { int type; const char *name; struct ida ida; }; /* * Connector and encoder types. */ static struct drm_conn_prop_enum_list drm_connector_enum_list[] = { { DRM_MODE_CONNECTOR_Unknown, "Unknown" }, { DRM_MODE_CONNECTOR_VGA, "VGA" }, { DRM_MODE_CONNECTOR_DVII, "DVI-I" }, { DRM_MODE_CONNECTOR_DVID, "DVI-D" }, { DRM_MODE_CONNECTOR_DVIA, "DVI-A" }, { DRM_MODE_CONNECTOR_Composite, "Composite" }, { DRM_MODE_CONNECTOR_SVIDEO, "SVIDEO" }, { DRM_MODE_CONNECTOR_LVDS, "LVDS" }, { DRM_MODE_CONNECTOR_Component, "Component" }, { DRM_MODE_CONNECTOR_9PinDIN, "DIN" }, { DRM_MODE_CONNECTOR_DisplayPort, "DP" }, { DRM_MODE_CONNECTOR_HDMIA, "HDMI-A" }, { DRM_MODE_CONNECTOR_HDMIB, "HDMI-B" }, { DRM_MODE_CONNECTOR_TV, "TV" }, { DRM_MODE_CONNECTOR_eDP, "eDP" }, { DRM_MODE_CONNECTOR_VIRTUAL, "Virtual" }, { DRM_MODE_CONNECTOR_DSI, "DSI" }, { DRM_MODE_CONNECTOR_DPI, "DPI" }, { DRM_MODE_CONNECTOR_WRITEBACK, "Writeback" }, }; void drm_connector_ida_init(void) { int i; for (i = 0; i < ARRAY_SIZE(drm_connector_enum_list); i++) ida_init(&drm_connector_enum_list[i].ida); } void drm_connector_ida_destroy(void) { int i; for (i = 0; i < ARRAY_SIZE(drm_connector_enum_list); i++) ida_destroy(&drm_connector_enum_list[i].ida); } /** * drm_connector_get_cmdline_mode - reads the user's cmdline mode * @connector: connector to quwery * * The kernel supports per-connector configuration of its consoles through * use of the video= parameter. This function parses that option and * extracts the user's specified mode (or enable/disable status) for a * particular connector. This is typically only used during the early fbdev * setup. */ static void drm_connector_get_cmdline_mode(struct drm_connector *connector) { struct drm_cmdline_mode *mode = &connector->cmdline_mode; char *option = NULL; if (fb_get_options(connector->name, &option)) return; if (!drm_mode_parse_command_line_for_connector(option, connector, mode)) return; if (mode->force) { DRM_INFO("forcing %s connector %s\n", connector->name, drm_get_connector_force_name(mode->force)); connector->force = mode->force; } DRM_DEBUG_KMS("cmdline mode for connector %s %dx%d@%dHz%s%s%s\n", connector->name, mode->xres, mode->yres, mode->refresh_specified ? mode->refresh : 60, mode->rb ? " reduced blanking" : "", mode->margins ? " with margins" : "", mode->interlace ? " interlaced" : ""); } static void drm_connector_free(struct kref *kref) { struct drm_connector *connector = container_of(kref, struct drm_connector, base.refcount); struct drm_device *dev = connector->dev; drm_mode_object_unregister(dev, &connector->base); connector->funcs->destroy(connector); } void drm_connector_free_work_fn(struct work_struct *work) { struct drm_connector *connector, *n; struct drm_device *dev = container_of(work, struct drm_device, mode_config.connector_free_work); struct drm_mode_config *config = &dev->mode_config; unsigned long flags; struct llist_node *freed; spin_lock_irqsave(&config->connector_list_lock, flags); freed = llist_del_all(&config->connector_free_list); spin_unlock_irqrestore(&config->connector_list_lock, flags); llist_for_each_entry_safe(connector, n, freed, free_node) { drm_mode_object_unregister(dev, &connector->base); connector->funcs->destroy(connector); } } /** * drm_connector_init - Init a preallocated connector * @dev: DRM device * @connector: the connector to init * @funcs: callbacks for this connector * @connector_type: user visible type of the connector * * Initialises a preallocated connector. Connectors should be * subclassed as part of driver connector objects. * * Returns: * Zero on success, error code on failure. */ int drm_connector_init(struct drm_device *dev, struct drm_connector *connector, const struct drm_connector_funcs *funcs, int connector_type) { struct drm_mode_config *config = &dev->mode_config; int ret; struct ida *connector_ida = &drm_connector_enum_list[connector_type].ida; WARN_ON(drm_drv_uses_atomic_modeset(dev) && (!funcs->atomic_destroy_state || !funcs->atomic_duplicate_state)); ret = __drm_mode_object_add(dev, &connector->base, DRM_MODE_OBJECT_CONNECTOR, false, drm_connector_free); if (ret) return ret; connector->base.properties = &connector->properties; connector->dev = dev; connector->funcs = funcs; /* connector index is used with 32bit bitmasks */ ret = ida_simple_get(&config->connector_ida, 0, 32, GFP_KERNEL); if (ret < 0) { DRM_DEBUG_KMS("Failed to allocate %s connector index: %d\n", drm_connector_enum_list[connector_type].name, ret); goto out_put; } connector->index = ret; ret = 0; connector->connector_type = connector_type; connector->connector_type_id = ida_simple_get(connector_ida, 1, 0, GFP_KERNEL); if (connector->connector_type_id < 0) { ret = connector->connector_type_id; goto out_put_id; } connector->name = kasprintf(GFP_KERNEL, "%s-%d", drm_connector_enum_list[connector_type].name, connector->connector_type_id); if (!connector->name) { ret = -ENOMEM; goto out_put_type_id; } INIT_LIST_HEAD(&connector->probed_modes); INIT_LIST_HEAD(&connector->modes); mutex_init(&connector->mutex); connector->edid_blob_ptr = NULL; connector->status = connector_status_unknown; connector->display_info.panel_orientation = DRM_MODE_PANEL_ORIENTATION_UNKNOWN; drm_connector_get_cmdline_mode(connector); /* We should add connectors at the end to avoid upsetting the connector * index too much. */ spin_lock_irq(&config->connector_list_lock); list_add_tail(&connector->head, &config->connector_list); config->num_connector++; spin_unlock_irq(&config->connector_list_lock); if (connector_type != DRM_MODE_CONNECTOR_VIRTUAL && connector_type != DRM_MODE_CONNECTOR_WRITEBACK) drm_object_attach_property(&connector->base, config->edid_property, 0); drm_object_attach_property(&connector->base, config->dpms_property, 0); drm_object_attach_property(&connector->base, config->link_status_property, 0); drm_object_attach_property(&connector->base, config->non_desktop_property, 0); if (drm_core_check_feature(dev, DRIVER_ATOMIC)) { drm_object_attach_property(&connector->base, config->prop_crtc_id, 0); } connector->debugfs_entry = NULL; out_put_type_id: if (ret) ida_simple_remove(connector_ida, connector->connector_type_id); out_put_id: if (ret) ida_simple_remove(&config->connector_ida, connector->index); out_put: if (ret) drm_mode_object_unregister(dev, &connector->base); return ret; } EXPORT_SYMBOL(drm_connector_init); /** * drm_connector_attach_encoder - attach a connector to an encoder * @connector: connector to attach * @encoder: encoder to attach @connector to * * This function links up a connector to an encoder. Note that the routing * restrictions between encoders and crtcs are exposed to userspace through the * possible_clones and possible_crtcs bitmasks. * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_attach_encoder(struct drm_connector *connector, struct drm_encoder *encoder) { int i; /* * In the past, drivers have attempted to model the static association * of connector to encoder in simple connector/encoder devices using a * direct assignment of connector->encoder = encoder. This connection * is a logical one and the responsibility of the core, so drivers are * expected not to mess with this. * * Note that the error return should've been enough here, but a large * majority of drivers ignores the return value, so add in a big WARN * to get people's attention. */ if (WARN_ON(connector->encoder)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(connector->encoder_ids); i++) { if (connector->encoder_ids[i] == 0) { connector->encoder_ids[i] = encoder->base.id; return 0; } } return -ENOMEM; } EXPORT_SYMBOL(drm_connector_attach_encoder); /** * drm_connector_has_possible_encoder - check if the connector and encoder are assosicated with each other * @connector: the connector * @encoder: the encoder * * Returns: * True if @encoder is one of the possible encoders for @connector. */ bool drm_connector_has_possible_encoder(struct drm_connector *connector, struct drm_encoder *encoder) { struct drm_encoder *enc; int i; drm_connector_for_each_possible_encoder(connector, enc, i) { if (enc == encoder) return true; } return false; } EXPORT_SYMBOL(drm_connector_has_possible_encoder); static void drm_mode_remove(struct drm_connector *connector, struct drm_display_mode *mode) { list_del(&mode->head); drm_mode_destroy(connector->dev, mode); } /** * drm_connector_cleanup - cleans up an initialised connector * @connector: connector to cleanup * * Cleans up the connector but doesn't free the object. */ void drm_connector_cleanup(struct drm_connector *connector) { struct drm_device *dev = connector->dev; struct drm_display_mode *mode, *t; /* The connector should have been removed from userspace long before * it is finally destroyed. */ if (WARN_ON(connector->registration_state == DRM_CONNECTOR_REGISTERED)) drm_connector_unregister(connector); if (connector->tile_group) { drm_mode_put_tile_group(dev, connector->tile_group); connector->tile_group = NULL; } list_for_each_entry_safe(mode, t, &connector->probed_modes, head) drm_mode_remove(connector, mode); list_for_each_entry_safe(mode, t, &connector->modes, head) drm_mode_remove(connector, mode); ida_simple_remove(&drm_connector_enum_list[connector->connector_type].ida, connector->connector_type_id); ida_simple_remove(&dev->mode_config.connector_ida, connector->index); kfree(connector->display_info.bus_formats); drm_mode_object_unregister(dev, &connector->base); kfree(connector->name); connector->name = NULL; spin_lock_irq(&dev->mode_config.connector_list_lock); list_del(&connector->head); dev->mode_config.num_connector--; spin_unlock_irq(&dev->mode_config.connector_list_lock); WARN_ON(connector->state && !connector->funcs->atomic_destroy_state); if (connector->state && connector->funcs->atomic_destroy_state) connector->funcs->atomic_destroy_state(connector, connector->state); mutex_destroy(&connector->mutex); memset(connector, 0, sizeof(*connector)); if (dev->registered) drm_sysfs_hotplug_event(dev); } EXPORT_SYMBOL(drm_connector_cleanup); /** * drm_connector_register - register a connector * @connector: the connector to register * * Register userspace interfaces for a connector * * Returns: * Zero on success, error code on failure. */ int drm_connector_register(struct drm_connector *connector) { int ret = 0; if (!connector->dev->registered) return 0; mutex_lock(&connector->mutex); if (connector->registration_state != DRM_CONNECTOR_INITIALIZING) goto unlock; ret = drm_sysfs_connector_add(connector); if (ret) goto unlock; ret = drm_debugfs_connector_add(connector); if (ret) { goto err_sysfs; } if (connector->funcs->late_register) { ret = connector->funcs->late_register(connector); if (ret) goto err_debugfs; } drm_mode_object_register(connector->dev, &connector->base); connector->registration_state = DRM_CONNECTOR_REGISTERED; goto unlock; err_debugfs: drm_debugfs_connector_remove(connector); err_sysfs: drm_sysfs_connector_remove(connector); unlock: mutex_unlock(&connector->mutex); return ret; } EXPORT_SYMBOL(drm_connector_register); /** * drm_connector_unregister - unregister a connector * @connector: the connector to unregister * * Unregister userspace interfaces for a connector */ void drm_connector_unregister(struct drm_connector *connector) { mutex_lock(&connector->mutex); if (connector->registration_state != DRM_CONNECTOR_REGISTERED) { mutex_unlock(&connector->mutex); return; } if (connector->funcs->early_unregister) connector->funcs->early_unregister(connector); drm_sysfs_connector_remove(connector); drm_debugfs_connector_remove(connector); connector->registration_state = DRM_CONNECTOR_UNREGISTERED; mutex_unlock(&connector->mutex); } EXPORT_SYMBOL(drm_connector_unregister); void drm_connector_unregister_all(struct drm_device *dev) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) drm_connector_unregister(connector); drm_connector_list_iter_end(&conn_iter); } int drm_connector_register_all(struct drm_device *dev) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; int ret = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { ret = drm_connector_register(connector); if (ret) break; } drm_connector_list_iter_end(&conn_iter); if (ret) drm_connector_unregister_all(dev); return ret; } /** * drm_get_connector_status_name - return a string for connector status * @status: connector status to compute name of * * In contrast to the other drm_get_*_name functions this one here returns a * const pointer and hence is threadsafe. */ const char *drm_get_connector_status_name(enum drm_connector_status status) { if (status == connector_status_connected) return "connected"; else if (status == connector_status_disconnected) return "disconnected"; else return "unknown"; } EXPORT_SYMBOL(drm_get_connector_status_name); /** * drm_get_connector_force_name - return a string for connector force * @force: connector force to get name of * * Returns: const pointer to name. */ const char *drm_get_connector_force_name(enum drm_connector_force force) { switch (force) { case DRM_FORCE_UNSPECIFIED: return "unspecified"; case DRM_FORCE_OFF: return "off"; case DRM_FORCE_ON: return "on"; case DRM_FORCE_ON_DIGITAL: return "digital"; default: return "unknown"; } } #ifdef CONFIG_LOCKDEP static struct lockdep_map connector_list_iter_dep_map = { .name = "drm_connector_list_iter" }; #endif /** * drm_connector_list_iter_begin - initialize a connector_list iterator * @dev: DRM device * @iter: connector_list iterator * * Sets @iter up to walk the &drm_mode_config.connector_list of @dev. @iter * must always be cleaned up again by calling drm_connector_list_iter_end(). * Iteration itself happens using drm_connector_list_iter_next() or * drm_for_each_connector_iter(). */ void drm_connector_list_iter_begin(struct drm_device *dev, struct drm_connector_list_iter *iter) { iter->dev = dev; iter->conn = NULL; lock_acquire_shared_recursive(&connector_list_iter_dep_map, 0, 1, NULL, _RET_IP_); } EXPORT_SYMBOL(drm_connector_list_iter_begin); /* * Extra-safe connector put function that works in any context. Should only be * used from the connector_iter functions, where we never really expect to * actually release the connector when dropping our final reference. */ static void __drm_connector_put_safe(struct drm_connector *conn) { struct drm_mode_config *config = &conn->dev->mode_config; lockdep_assert_held(&config->connector_list_lock); if (!refcount_dec_and_test(&conn->base.refcount.refcount)) return; llist_add(&conn->free_node, &config->connector_free_list); schedule_work(&config->connector_free_work); } /** * drm_connector_list_iter_next - return next connector * @iter: connector_list iterator * * Returns the next connector for @iter, or NULL when the list walk has * completed. */ struct drm_connector * drm_connector_list_iter_next(struct drm_connector_list_iter *iter) { struct drm_connector *old_conn = iter->conn; struct drm_mode_config *config = &iter->dev->mode_config; struct list_head *lhead; unsigned long flags; spin_lock_irqsave(&config->connector_list_lock, flags); lhead = old_conn ? &old_conn->head : &config->connector_list; do { if (lhead->next == &config->connector_list) { iter->conn = NULL; break; } lhead = lhead->next; iter->conn = list_entry(lhead, struct drm_connector, head); /* loop until it's not a zombie connector */ } while (!kref_get_unless_zero(&iter->conn->base.refcount)); if (old_conn) __drm_connector_put_safe(old_conn); spin_unlock_irqrestore(&config->connector_list_lock, flags); return iter->conn; } EXPORT_SYMBOL(drm_connector_list_iter_next); /** * drm_connector_list_iter_end - tear down a connector_list iterator * @iter: connector_list iterator * * Tears down @iter and releases any resources (like &drm_connector references) * acquired while walking the list. This must always be called, both when the * iteration completes fully or when it was aborted without walking the entire * list. */ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter) { struct drm_mode_config *config = &iter->dev->mode_config; unsigned long flags; iter->dev = NULL; if (iter->conn) { spin_lock_irqsave(&config->connector_list_lock, flags); __drm_connector_put_safe(iter->conn); spin_unlock_irqrestore(&config->connector_list_lock, flags); } lock_release(&connector_list_iter_dep_map, 0, _RET_IP_); } EXPORT_SYMBOL(drm_connector_list_iter_end); static const struct drm_prop_enum_list drm_subpixel_enum_list[] = { { SubPixelUnknown, "Unknown" }, { SubPixelHorizontalRGB, "Horizontal RGB" }, { SubPixelHorizontalBGR, "Horizontal BGR" }, { SubPixelVerticalRGB, "Vertical RGB" }, { SubPixelVerticalBGR, "Vertical BGR" }, { SubPixelNone, "None" }, }; /** * drm_get_subpixel_order_name - return a string for a given subpixel enum * @order: enum of subpixel_order * * Note you could abuse this and return something out of bounds, but that * would be a caller error. No unscrubbed user data should make it here. */ const char *drm_get_subpixel_order_name(enum subpixel_order order) { return drm_subpixel_enum_list[order].name; } EXPORT_SYMBOL(drm_get_subpixel_order_name); static const struct drm_prop_enum_list drm_dpms_enum_list[] = { { DRM_MODE_DPMS_ON, "On" }, { DRM_MODE_DPMS_STANDBY, "Standby" }, { DRM_MODE_DPMS_SUSPEND, "Suspend" }, { DRM_MODE_DPMS_OFF, "Off" } }; DRM_ENUM_NAME_FN(drm_get_dpms_name, drm_dpms_enum_list) static const struct drm_prop_enum_list drm_link_status_enum_list[] = { { DRM_MODE_LINK_STATUS_GOOD, "Good" }, { DRM_MODE_LINK_STATUS_BAD, "Bad" }, }; /** * drm_display_info_set_bus_formats - set the supported bus formats * @info: display info to store bus formats in * @formats: array containing the supported bus formats * @num_formats: the number of entries in the fmts array * * Store the supported bus formats in display info structure. * See MEDIA_BUS_FMT_* definitions in include/uapi/linux/media-bus-format.h for * a full list of available formats. */ int drm_display_info_set_bus_formats(struct drm_display_info *info, const u32 *formats, unsigned int num_formats) { u32 *fmts = NULL; if (!formats && num_formats) return -EINVAL; if (formats && num_formats) { fmts = kmemdup(formats, sizeof(*formats) * num_formats, GFP_KERNEL); if (!fmts) return -ENOMEM; } kfree(info->bus_formats); info->bus_formats = fmts; info->num_bus_formats = num_formats; return 0; } EXPORT_SYMBOL(drm_display_info_set_bus_formats); /* Optional connector properties. */ static const struct drm_prop_enum_list drm_scaling_mode_enum_list[] = { { DRM_MODE_SCALE_NONE, "None" }, { DRM_MODE_SCALE_FULLSCREEN, "Full" }, { DRM_MODE_SCALE_CENTER, "Center" }, { DRM_MODE_SCALE_ASPECT, "Full aspect" }, }; static const struct drm_prop_enum_list drm_aspect_ratio_enum_list[] = { { DRM_MODE_PICTURE_ASPECT_NONE, "Automatic" }, { DRM_MODE_PICTURE_ASPECT_4_3, "4:3" }, { DRM_MODE_PICTURE_ASPECT_16_9, "16:9" }, }; static const struct drm_prop_enum_list drm_content_type_enum_list[] = { { DRM_MODE_CONTENT_TYPE_NO_DATA, "No Data" }, { DRM_MODE_CONTENT_TYPE_GRAPHICS, "Graphics" }, { DRM_MODE_CONTENT_TYPE_PHOTO, "Photo" }, { DRM_MODE_CONTENT_TYPE_CINEMA, "Cinema" }, { DRM_MODE_CONTENT_TYPE_GAME, "Game" }, }; static const struct drm_prop_enum_list drm_panel_orientation_enum_list[] = { { DRM_MODE_PANEL_ORIENTATION_NORMAL, "Normal" }, { DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP, "Upside Down" }, { DRM_MODE_PANEL_ORIENTATION_LEFT_UP, "Left Side Up" }, { DRM_MODE_PANEL_ORIENTATION_RIGHT_UP, "Right Side Up" }, }; static const struct drm_prop_enum_list drm_dvi_i_select_enum_list[] = { { DRM_MODE_SUBCONNECTOR_Automatic, "Automatic" }, /* DVI-I and TV-out */ { DRM_MODE_SUBCONNECTOR_DVID, "DVI-D" }, /* DVI-I */ { DRM_MODE_SUBCONNECTOR_DVIA, "DVI-A" }, /* DVI-I */ }; DRM_ENUM_NAME_FN(drm_get_dvi_i_select_name, drm_dvi_i_select_enum_list) static const struct drm_prop_enum_list drm_dvi_i_subconnector_enum_list[] = { { DRM_MODE_SUBCONNECTOR_Unknown, "Unknown" }, /* DVI-I and TV-out */ { DRM_MODE_SUBCONNECTOR_DVID, "DVI-D" }, /* DVI-I */ { DRM_MODE_SUBCONNECTOR_DVIA, "DVI-A" }, /* DVI-I */ }; DRM_ENUM_NAME_FN(drm_get_dvi_i_subconnector_name, drm_dvi_i_subconnector_enum_list) static const struct drm_prop_enum_list drm_tv_select_enum_list[] = { { DRM_MODE_SUBCONNECTOR_Automatic, "Automatic" }, /* DVI-I and TV-out */ { DRM_MODE_SUBCONNECTOR_Composite, "Composite" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_SVIDEO, "SVIDEO" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_Component, "Component" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_SCART, "SCART" }, /* TV-out */ }; DRM_ENUM_NAME_FN(drm_get_tv_select_name, drm_tv_select_enum_list) static const struct drm_prop_enum_list drm_tv_subconnector_enum_list[] = { { DRM_MODE_SUBCONNECTOR_Unknown, "Unknown" }, /* DVI-I and TV-out */ { DRM_MODE_SUBCONNECTOR_Composite, "Composite" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_SVIDEO, "SVIDEO" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_Component, "Component" }, /* TV-out */ { DRM_MODE_SUBCONNECTOR_SCART, "SCART" }, /* TV-out */ }; DRM_ENUM_NAME_FN(drm_get_tv_subconnector_name, drm_tv_subconnector_enum_list) static struct drm_prop_enum_list drm_cp_enum_list[] = { { DRM_MODE_CONTENT_PROTECTION_UNDESIRED, "Undesired" }, { DRM_MODE_CONTENT_PROTECTION_DESIRED, "Desired" }, { DRM_MODE_CONTENT_PROTECTION_ENABLED, "Enabled" }, }; DRM_ENUM_NAME_FN(drm_get_content_protection_name, drm_cp_enum_list) /** * DOC: standard connector properties * * DRM connectors have a few standardized properties: * * EDID: * Blob property which contains the current EDID read from the sink. This * is useful to parse sink identification information like vendor, model * and serial. Drivers should update this property by calling * drm_connector_update_edid_property(), usually after having parsed * the EDID using drm_add_edid_modes(). Userspace cannot change this * property. * DPMS: * Legacy property for setting the power state of the connector. For atomic * drivers this is only provided for backwards compatibility with existing * drivers, it remaps to controlling the "ACTIVE" property on the CRTC the * connector is linked to. Drivers should never set this property directly, * it is handled by the DRM core by calling the &drm_connector_funcs.dpms * callback. For atomic drivers the remapping to the "ACTIVE" property is * implemented in the DRM core. This is the only standard connector * property that userspace can change. * * Note that this property cannot be set through the MODE_ATOMIC ioctl, * userspace must use "ACTIVE" on the CRTC instead. * * WARNING: * * For userspace also running on legacy drivers the "DPMS" semantics are a * lot more complicated. First, userspace cannot rely on the "DPMS" value * returned by the GETCONNECTOR actually reflecting reality, because many * drivers fail to update it. For atomic drivers this is taken care of in * drm_atomic_helper_update_legacy_modeset_state(). * * The second issue is that the DPMS state is only well-defined when the * connector is connected to a CRTC. In atomic the DRM core enforces that * "ACTIVE" is off in such a case, no such checks exists for "DPMS". * * Finally, when enabling an output using the legacy SETCONFIG ioctl then * "DPMS" is forced to ON. But see above, that might not be reflected in * the software value on legacy drivers. * * Summarizing: Only set "DPMS" when the connector is known to be enabled, * assume that a successful SETCONFIG call also sets "DPMS" to on, and * never read back the value of "DPMS" because it can be incorrect. * PATH: * Connector path property to identify how this sink is physically * connected. Used by DP MST. This should be set by calling * drm_connector_set_path_property(), in the case of DP MST with the * path property the MST manager created. Userspace cannot change this * property. * TILE: * Connector tile group property to indicate how a set of DRM connector * compose together into one logical screen. This is used by both high-res * external screens (often only using a single cable, but exposing multiple * DP MST sinks), or high-res integrated panels (like dual-link DSI) which * are not gen-locked. Note that for tiled panels which are genlocked, like * dual-link LVDS or dual-link DSI, the driver should try to not expose the * tiling and virtualize both &drm_crtc and &drm_plane if needed. Drivers * should update this value using drm_connector_set_tile_property(). * Userspace cannot change this property. * link-status: * Connector link-status property to indicate the status of link. The * default value of link-status is "GOOD". If something fails during or * after modeset, the kernel driver may set this to "BAD" and issue a * hotplug uevent. Drivers should update this value using * drm_connector_set_link_status_property(). * non_desktop: * Indicates the output should be ignored for purposes of displaying a * standard desktop environment or console. This is most likely because * the output device is not rectilinear. * Content Protection: * This property is used by userspace to request the kernel protect future * content communicated over the link. When requested, kernel will apply * the appropriate means of protection (most often HDCP), and use the * property to tell userspace the protection is active. * * Drivers can set this up by calling * drm_connector_attach_content_protection_property() on initialization. * * The value of this property can be one of the following: * * DRM_MODE_CONTENT_PROTECTION_UNDESIRED = 0 * The link is not protected, content is transmitted in the clear. * DRM_MODE_CONTENT_PROTECTION_DESIRED = 1 * Userspace has requested content protection, but the link is not * currently protected. When in this state, kernel should enable * Content Protection as soon as possible. * DRM_MODE_CONTENT_PROTECTION_ENABLED = 2 * Userspace has requested content protection, and the link is * protected. Only the driver can set the property to this value. * If userspace attempts to set to ENABLED, kernel will return * -EINVAL. * * A few guidelines: * * - DESIRED state should be preserved until userspace de-asserts it by * setting the property to UNDESIRED. This means ENABLED should only * transition to UNDESIRED when the user explicitly requests it. * - If the state is DESIRED, kernel should attempt to re-authenticate the * link whenever possible. This includes across disable/enable, dpms, * hotplug, downstream device changes, link status failures, etc.. * - Userspace is responsible for polling the property to determine when * the value transitions from ENABLED to DESIRED. This signifies the link * is no longer protected and userspace should take appropriate action * (whatever that might be). * * Connectors also have one standardized atomic property: * * CRTC_ID: * Mode object ID of the &drm_crtc this connector should be connected to. * * Connectors for LCD panels may also have one standardized property: * * panel orientation: * On some devices the LCD panel is mounted in the casing in such a way * that the up/top side of the panel does not match with the top side of * the device. Userspace can use this property to check for this. * Note that input coordinates from touchscreens (input devices with * INPUT_PROP_DIRECT) will still map 1:1 to the actual LCD panel * coordinates, so if userspace rotates the picture to adjust for * the orientation it must also apply the same transformation to the * touchscreen input coordinates. This property is initialized by calling * drm_connector_init_panel_orientation_property(). * * scaling mode: * This property defines how a non-native mode is upscaled to the native * mode of an LCD panel: * * None: * No upscaling happens, scaling is left to the panel. Not all * drivers expose this mode. * Full: * The output is upscaled to the full resolution of the panel, * ignoring the aspect ratio. * Center: * No upscaling happens, the output is centered within the native * resolution the panel. * Full aspect: * The output is upscaled to maximize either the width or height * while retaining the aspect ratio. * * This property should be set up by calling * drm_connector_attach_scaling_mode_property(). Note that drivers * can also expose this property to external outputs, in which case they * must support "None", which should be the default (since external screens * have a built-in scaler). */ int drm_connector_create_standard_properties(struct drm_device *dev) { struct drm_property *prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB | DRM_MODE_PROP_IMMUTABLE, "EDID", 0); if (!prop) return -ENOMEM; dev->mode_config.edid_property = prop; prop = drm_property_create_enum(dev, 0, "DPMS", drm_dpms_enum_list, ARRAY_SIZE(drm_dpms_enum_list)); if (!prop) return -ENOMEM; dev->mode_config.dpms_property = prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB | DRM_MODE_PROP_IMMUTABLE, "PATH", 0); if (!prop) return -ENOMEM; dev->mode_config.path_property = prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB | DRM_MODE_PROP_IMMUTABLE, "TILE", 0); if (!prop) return -ENOMEM; dev->mode_config.tile_property = prop; prop = drm_property_create_enum(dev, 0, "link-status", drm_link_status_enum_list, ARRAY_SIZE(drm_link_status_enum_list)); if (!prop) return -ENOMEM; dev->mode_config.link_status_property = prop; prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE, "non-desktop"); if (!prop) return -ENOMEM; dev->mode_config.non_desktop_property = prop; return 0; } /** * drm_mode_create_dvi_i_properties - create DVI-I specific connector properties * @dev: DRM device * * Called by a driver the first time a DVI-I connector is made. */ int drm_mode_create_dvi_i_properties(struct drm_device *dev) { struct drm_property *dvi_i_selector; struct drm_property *dvi_i_subconnector; if (dev->mode_config.dvi_i_select_subconnector_property) return 0; dvi_i_selector = drm_property_create_enum(dev, 0, "select subconnector", drm_dvi_i_select_enum_list, ARRAY_SIZE(drm_dvi_i_select_enum_list)); dev->mode_config.dvi_i_select_subconnector_property = dvi_i_selector; dvi_i_subconnector = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE, "subconnector", drm_dvi_i_subconnector_enum_list, ARRAY_SIZE(drm_dvi_i_subconnector_enum_list)); dev->mode_config.dvi_i_subconnector_property = dvi_i_subconnector; return 0; } EXPORT_SYMBOL(drm_mode_create_dvi_i_properties); /** * DOC: HDMI connector properties * * content type (HDMI specific): * Indicates content type setting to be used in HDMI infoframes to indicate * content type for the external device, so that it adjusts it's display * settings accordingly. * * The value of this property can be one of the following: * * No Data: * Content type is unknown * Graphics: * Content type is graphics * Photo: * Content type is photo * Cinema: * Content type is cinema * Game: * Content type is game * * Drivers can set up this property by calling * drm_connector_attach_content_type_property(). Decoding to * infoframe values is done through drm_hdmi_avi_infoframe_content_type(). */ /** * drm_connector_attach_content_type_property - attach content-type property * @connector: connector to attach content type property on. * * Called by a driver the first time a HDMI connector is made. */ int drm_connector_attach_content_type_property(struct drm_connector *connector) { if (!drm_mode_create_content_type_property(connector->dev)) drm_object_attach_property(&connector->base, connector->dev->mode_config.content_type_property, DRM_MODE_CONTENT_TYPE_NO_DATA); return 0; } EXPORT_SYMBOL(drm_connector_attach_content_type_property); /** * drm_hdmi_avi_infoframe_content_type() - fill the HDMI AVI infoframe * content type information, based * on correspondent DRM property. * @frame: HDMI AVI infoframe * @conn_state: DRM display connector state * */ void drm_hdmi_avi_infoframe_content_type(struct hdmi_avi_infoframe *frame, const struct drm_connector_state *conn_state) { switch (conn_state->content_type) { case DRM_MODE_CONTENT_TYPE_GRAPHICS: frame->content_type = HDMI_CONTENT_TYPE_GRAPHICS; break; case DRM_MODE_CONTENT_TYPE_CINEMA: frame->content_type = HDMI_CONTENT_TYPE_CINEMA; break; case DRM_MODE_CONTENT_TYPE_GAME: frame->content_type = HDMI_CONTENT_TYPE_GAME; break; case DRM_MODE_CONTENT_TYPE_PHOTO: frame->content_type = HDMI_CONTENT_TYPE_PHOTO; break; default: /* Graphics is the default(0) */ frame->content_type = HDMI_CONTENT_TYPE_GRAPHICS; } frame->itc = conn_state->content_type != DRM_MODE_CONTENT_TYPE_NO_DATA; } EXPORT_SYMBOL(drm_hdmi_avi_infoframe_content_type); /** * drm_create_tv_properties - create TV specific connector properties * @dev: DRM device * @num_modes: number of different TV formats (modes) supported * @modes: array of pointers to strings containing name of each format * * Called by a driver's TV initialization routine, this function creates * the TV specific connector properties for a given device. Caller is * responsible for allocating a list of format names and passing them to * this routine. */ int drm_mode_create_tv_properties(struct drm_device *dev, unsigned int num_modes, const char * const modes[]) { struct drm_property *tv_selector; struct drm_property *tv_subconnector; unsigned int i; if (dev->mode_config.tv_select_subconnector_property) return 0; /* * Basic connector properties */ tv_selector = drm_property_create_enum(dev, 0, "select subconnector", drm_tv_select_enum_list, ARRAY_SIZE(drm_tv_select_enum_list)); if (!tv_selector) goto nomem; dev->mode_config.tv_select_subconnector_property = tv_selector; tv_subconnector = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE, "subconnector", drm_tv_subconnector_enum_list, ARRAY_SIZE(drm_tv_subconnector_enum_list)); if (!tv_subconnector) goto nomem; dev->mode_config.tv_subconnector_property = tv_subconnector; /* * Other, TV specific properties: margins & TV modes. */ dev->mode_config.tv_left_margin_property = drm_property_create_range(dev, 0, "left margin", 0, 100); if (!dev->mode_config.tv_left_margin_property) goto nomem; dev->mode_config.tv_right_margin_property = drm_property_create_range(dev, 0, "right margin", 0, 100); if (!dev->mode_config.tv_right_margin_property) goto nomem; dev->mode_config.tv_top_margin_property = drm_property_create_range(dev, 0, "top margin", 0, 100); if (!dev->mode_config.tv_top_margin_property) goto nomem; dev->mode_config.tv_bottom_margin_property = drm_property_create_range(dev, 0, "bottom margin", 0, 100); if (!dev->mode_config.tv_bottom_margin_property) goto nomem; dev->mode_config.tv_mode_property = drm_property_create(dev, DRM_MODE_PROP_ENUM, "mode", num_modes); if (!dev->mode_config.tv_mode_property) goto nomem; for (i = 0; i < num_modes; i++) drm_property_add_enum(dev->mode_config.tv_mode_property, i, modes[i]); dev->mode_config.tv_brightness_property = drm_property_create_range(dev, 0, "brightness", 0, 100); if (!dev->mode_config.tv_brightness_property) goto nomem; dev->mode_config.tv_contrast_property = drm_property_create_range(dev, 0, "contrast", 0, 100); if (!dev->mode_config.tv_contrast_property) goto nomem; dev->mode_config.tv_flicker_reduction_property = drm_property_create_range(dev, 0, "flicker reduction", 0, 100); if (!dev->mode_config.tv_flicker_reduction_property) goto nomem; dev->mode_config.tv_overscan_property = drm_property_create_range(dev, 0, "overscan", 0, 100); if (!dev->mode_config.tv_overscan_property) goto nomem; dev->mode_config.tv_saturation_property = drm_property_create_range(dev, 0, "saturation", 0, 100); if (!dev->mode_config.tv_saturation_property) goto nomem; dev->mode_config.tv_hue_property = drm_property_create_range(dev, 0, "hue", 0, 100); if (!dev->mode_config.tv_hue_property) goto nomem; return 0; nomem: return -ENOMEM; } EXPORT_SYMBOL(drm_mode_create_tv_properties); /** * drm_mode_create_scaling_mode_property - create scaling mode property * @dev: DRM device * * Called by a driver the first time it's needed, must be attached to desired * connectors. * * Atomic drivers should use drm_connector_attach_scaling_mode_property() * instead to correctly assign &drm_connector_state.picture_aspect_ratio * in the atomic state. */ int drm_mode_create_scaling_mode_property(struct drm_device *dev) { struct drm_property *scaling_mode; if (dev->mode_config.scaling_mode_property) return 0; scaling_mode = drm_property_create_enum(dev, 0, "scaling mode", drm_scaling_mode_enum_list, ARRAY_SIZE(drm_scaling_mode_enum_list)); dev->mode_config.scaling_mode_property = scaling_mode; return 0; } EXPORT_SYMBOL(drm_mode_create_scaling_mode_property); /** * drm_connector_attach_scaling_mode_property - attach atomic scaling mode property * @connector: connector to attach scaling mode property on. * @scaling_mode_mask: or'ed mask of BIT(%DRM_MODE_SCALE_\*). * * This is used to add support for scaling mode to atomic drivers. * The scaling mode will be set to &drm_connector_state.picture_aspect_ratio * and can be used from &drm_connector_helper_funcs->atomic_check for validation. * * This is the atomic version of drm_mode_create_scaling_mode_property(). * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector, u32 scaling_mode_mask) { struct drm_device *dev = connector->dev; struct drm_property *scaling_mode_property; int i; const unsigned valid_scaling_mode_mask = (1U << ARRAY_SIZE(drm_scaling_mode_enum_list)) - 1; if (WARN_ON(hweight32(scaling_mode_mask) < 2 || scaling_mode_mask & ~valid_scaling_mode_mask)) return -EINVAL; scaling_mode_property = drm_property_create(dev, DRM_MODE_PROP_ENUM, "scaling mode", hweight32(scaling_mode_mask)); if (!scaling_mode_property) return -ENOMEM; for (i = 0; i < ARRAY_SIZE(drm_scaling_mode_enum_list); i++) { int ret; if (!(BIT(i) & scaling_mode_mask)) continue; ret = drm_property_add_enum(scaling_mode_property, drm_scaling_mode_enum_list[i].type, drm_scaling_mode_enum_list[i].name); if (ret) { drm_property_destroy(dev, scaling_mode_property); return ret; } } drm_object_attach_property(&connector->base, scaling_mode_property, 0); connector->scaling_mode_property = scaling_mode_property; return 0; } EXPORT_SYMBOL(drm_connector_attach_scaling_mode_property); /** * drm_connector_attach_content_protection_property - attach content protection * property * * @connector: connector to attach CP property on. * * This is used to add support for content protection on select connectors. * Content Protection is intentionally vague to allow for different underlying * technologies, however it is most implemented by HDCP. * * The content protection will be set to &drm_connector_state.content_protection * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_attach_content_protection_property( struct drm_connector *connector) { struct drm_device *dev = connector->dev; struct drm_property *prop; prop = drm_property_create_enum(dev, 0, "Content Protection", drm_cp_enum_list, ARRAY_SIZE(drm_cp_enum_list)); if (!prop) return -ENOMEM; drm_object_attach_property(&connector->base, prop, DRM_MODE_CONTENT_PROTECTION_UNDESIRED); connector->content_protection_property = prop; return 0; } EXPORT_SYMBOL(drm_connector_attach_content_protection_property); /** * drm_mode_create_aspect_ratio_property - create aspect ratio property * @dev: DRM device * * Called by a driver the first time it's needed, must be attached to desired * connectors. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_create_aspect_ratio_property(struct drm_device *dev) { if (dev->mode_config.aspect_ratio_property) return 0; dev->mode_config.aspect_ratio_property = drm_property_create_enum(dev, 0, "aspect ratio", drm_aspect_ratio_enum_list, ARRAY_SIZE(drm_aspect_ratio_enum_list)); if (dev->mode_config.aspect_ratio_property == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(drm_mode_create_aspect_ratio_property); /** * drm_mode_create_content_type_property - create content type property * @dev: DRM device * * Called by a driver the first time it's needed, must be attached to desired * connectors. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_create_content_type_property(struct drm_device *dev) { if (dev->mode_config.content_type_property) return 0; dev->mode_config.content_type_property = drm_property_create_enum(dev, 0, "content type", drm_content_type_enum_list, ARRAY_SIZE(drm_content_type_enum_list)); if (dev->mode_config.content_type_property == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(drm_mode_create_content_type_property); /** * drm_mode_create_suggested_offset_properties - create suggests offset properties * @dev: DRM device * * Create the the suggested x/y offset property for connectors. */ int drm_mode_create_suggested_offset_properties(struct drm_device *dev) { if (dev->mode_config.suggested_x_property && dev->mode_config.suggested_y_property) return 0; dev->mode_config.suggested_x_property = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE, "suggested X", 0, 0xffffffff); dev->mode_config.suggested_y_property = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE, "suggested Y", 0, 0xffffffff); if (dev->mode_config.suggested_x_property == NULL || dev->mode_config.suggested_y_property == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(drm_mode_create_suggested_offset_properties); /** * drm_connector_set_path_property - set tile property on connector * @connector: connector to set property on. * @path: path to use for property; must not be NULL. * * This creates a property to expose to userspace to specify a * connector path. This is mainly used for DisplayPort MST where * connectors have a topology and we want to allow userspace to give * them more meaningful names. * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_set_path_property(struct drm_connector *connector, const char *path) { struct drm_device *dev = connector->dev; int ret; ret = drm_property_replace_global_blob(dev, &connector->path_blob_ptr, strlen(path) + 1, path, &connector->base, dev->mode_config.path_property); return ret; } EXPORT_SYMBOL(drm_connector_set_path_property); /** * drm_connector_set_tile_property - set tile property on connector * @connector: connector to set property on. * * This looks up the tile information for a connector, and creates a * property for userspace to parse if it exists. The property is of * the form of 8 integers using ':' as a separator. * * Returns: * Zero on success, errno on failure. */ int drm_connector_set_tile_property(struct drm_connector *connector) { struct drm_device *dev = connector->dev; char tile[256]; int ret; if (!connector->has_tile) { ret = drm_property_replace_global_blob(dev, &connector->tile_blob_ptr, 0, NULL, &connector->base, dev->mode_config.tile_property); return ret; } snprintf(tile, 256, "%d:%d:%d:%d:%d:%d:%d:%d", connector->tile_group->id, connector->tile_is_single_monitor, connector->num_h_tile, connector->num_v_tile, connector->tile_h_loc, connector->tile_v_loc, connector->tile_h_size, connector->tile_v_size); ret = drm_property_replace_global_blob(dev, &connector->tile_blob_ptr, strlen(tile) + 1, tile, &connector->base, dev->mode_config.tile_property); return ret; } EXPORT_SYMBOL(drm_connector_set_tile_property); /** * drm_connector_update_edid_property - update the edid property of a connector * @connector: drm connector * @edid: new value of the edid property * * This function creates a new blob modeset object and assigns its id to the * connector's edid property. * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_update_edid_property(struct drm_connector *connector, const struct edid *edid) { struct drm_device *dev = connector->dev; size_t size = 0; int ret; /* ignore requests to set edid when overridden */ if (connector->override_edid) return 0; if (edid) size = EDID_LENGTH * (1 + edid->extensions); /* Set the display info, using edid if available, otherwise * reseting the values to defaults. This duplicates the work * done in drm_add_edid_modes, but that function is not * consistently called before this one in all drivers and the * computation is cheap enough that it seems better to * duplicate it rather than attempt to ensure some arbitrary * ordering of calls. */ if (edid) drm_add_display_info(connector, edid); else drm_reset_display_info(connector); drm_object_property_set_value(&connector->base, dev->mode_config.non_desktop_property, connector->display_info.non_desktop); ret = drm_property_replace_global_blob(dev, &connector->edid_blob_ptr, size, edid, &connector->base, dev->mode_config.edid_property); return ret; } EXPORT_SYMBOL(drm_connector_update_edid_property); /** * drm_connector_set_link_status_property - Set link status property of a connector * @connector: drm connector * @link_status: new value of link status property (0: Good, 1: Bad) * * In usual working scenario, this link status property will always be set to * "GOOD". If something fails during or after a mode set, the kernel driver * may set this link status property to "BAD". The caller then needs to send a * hotplug uevent for userspace to re-check the valid modes through * GET_CONNECTOR_IOCTL and retry modeset. * * Note: Drivers cannot rely on userspace to support this property and * issue a modeset. As such, they may choose to handle issues (like * re-training a link) without userspace's intervention. * * The reason for adding this property is to handle link training failures, but * it is not limited to DP or link training. For example, if we implement * asynchronous setcrtc, this property can be used to report any failures in that. */ void drm_connector_set_link_status_property(struct drm_connector *connector, uint64_t link_status) { struct drm_device *dev = connector->dev; drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); connector->state->link_status = link_status; drm_modeset_unlock(&dev->mode_config.connection_mutex); } EXPORT_SYMBOL(drm_connector_set_link_status_property); /** * drm_connector_init_panel_orientation_property - * initialize the connecters panel_orientation property * @connector: connector for which to init the panel-orientation property. * @width: width in pixels of the panel, used for panel quirk detection * @height: height in pixels of the panel, used for panel quirk detection * * This function should only be called for built-in panels, after setting * connector->display_info.panel_orientation first (if known). * * This function will check for platform specific (e.g. DMI based) quirks * overriding display_info.panel_orientation first, then if panel_orientation * is not DRM_MODE_PANEL_ORIENTATION_UNKNOWN it will attach the * "panel orientation" property to the connector. * * Returns: * Zero on success, negative errno on failure. */ int drm_connector_init_panel_orientation_property( struct drm_connector *connector, int width, int height) { struct drm_device *dev = connector->dev; struct drm_display_info *info = &connector->display_info; struct drm_property *prop; int orientation_quirk; orientation_quirk = drm_get_panel_orientation_quirk(width, height); if (orientation_quirk != DRM_MODE_PANEL_ORIENTATION_UNKNOWN) info->panel_orientation = orientation_quirk; if (info->panel_orientation == DRM_MODE_PANEL_ORIENTATION_UNKNOWN) return 0; prop = dev->mode_config.panel_orientation_property; if (!prop) { prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE, "panel orientation", drm_panel_orientation_enum_list, ARRAY_SIZE(drm_panel_orientation_enum_list)); if (!prop) return -ENOMEM; dev->mode_config.panel_orientation_property = prop; } drm_object_attach_property(&connector->base, prop, info->panel_orientation); return 0; } EXPORT_SYMBOL(drm_connector_init_panel_orientation_property); int drm_connector_set_obj_prop(struct drm_mode_object *obj, struct drm_property *property, uint64_t value) { int ret = -EINVAL; struct drm_connector *connector = obj_to_connector(obj); /* Do DPMS ourselves */ if (property == connector->dev->mode_config.dpms_property) { ret = (*connector->funcs->dpms)(connector, (int)value); } else if (connector->funcs->set_property) ret = connector->funcs->set_property(connector, property, value); if (!ret) drm_object_property_set_value(&connector->base, property, value); return ret; } int drm_connector_property_set_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_connector_set_property *conn_set_prop = data; struct drm_mode_obj_set_property obj_set_prop = { .value = conn_set_prop->value, .prop_id = conn_set_prop->prop_id, .obj_id = conn_set_prop->connector_id, .obj_type = DRM_MODE_OBJECT_CONNECTOR }; /* It does all the locking and checking we need */ return drm_mode_obj_set_property_ioctl(dev, &obj_set_prop, file_priv); } static struct drm_encoder *drm_connector_get_encoder(struct drm_connector *connector) { /* For atomic drivers only state objects are synchronously updated and * protected by modeset locks, so check those first. */ if (connector->state) return connector->state->best_encoder; return connector->encoder; } static bool drm_mode_expose_to_userspace(const struct drm_display_mode *mode, const struct list_head *export_list, const struct drm_file *file_priv) { /* * If user-space hasn't configured the driver to expose the stereo 3D * modes, don't expose them. */ if (!file_priv->stereo_allowed && drm_mode_is_stereo(mode)) return false; /* * If user-space hasn't configured the driver to expose the modes * with aspect-ratio, don't expose them. However if such a mode * is unique, let it be exposed, but reset the aspect-ratio flags * while preparing the list of user-modes. */ if (!file_priv->aspect_ratio_allowed) { struct drm_display_mode *mode_itr; list_for_each_entry(mode_itr, export_list, export_head) if (drm_mode_match(mode_itr, mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) return false; } return true; } int drm_mode_getconnector(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_get_connector *out_resp = data; struct drm_connector *connector; struct drm_encoder *encoder; struct drm_display_mode *mode; int mode_count = 0; int encoders_count = 0; int ret = 0; int copied = 0; int i; struct drm_mode_modeinfo u_mode; struct drm_mode_modeinfo __user *mode_ptr; uint32_t __user *encoder_ptr; LIST_HEAD(export_list); if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EINVAL; memset(&u_mode, 0, sizeof(struct drm_mode_modeinfo)); connector = drm_connector_lookup(dev, file_priv, out_resp->connector_id); if (!connector) return -ENOENT; drm_connector_for_each_possible_encoder(connector, encoder, i) encoders_count++; if ((out_resp->count_encoders >= encoders_count) && encoders_count) { copied = 0; encoder_ptr = (uint32_t __user *)(unsigned long)(out_resp->encoders_ptr); drm_connector_for_each_possible_encoder(connector, encoder, i) { if (put_user(encoder->base.id, encoder_ptr + copied)) { ret = -EFAULT; goto out; } copied++; } } out_resp->count_encoders = encoders_count; out_resp->connector_id = connector->base.id; out_resp->connector_type = connector->connector_type; out_resp->connector_type_id = connector->connector_type_id; mutex_lock(&dev->mode_config.mutex); if (out_resp->count_modes == 0) { connector->funcs->fill_modes(connector, dev->mode_config.max_width, dev->mode_config.max_height); } out_resp->mm_width = connector->display_info.width_mm; out_resp->mm_height = connector->display_info.height_mm; out_resp->subpixel = connector->display_info.subpixel_order; out_resp->connection = connector->status; /* delayed so we get modes regardless of pre-fill_modes state */ list_for_each_entry(mode, &connector->modes, head) if (drm_mode_expose_to_userspace(mode, &export_list, file_priv)) { list_add_tail(&mode->export_head, &export_list); mode_count++; } /* * This ioctl is called twice, once to determine how much space is * needed, and the 2nd time to fill it. * The modes that need to be exposed to the user are maintained in the * 'export_list'. When the ioctl is called first time to determine the, * space, the export_list gets filled, to find the no.of modes. In the * 2nd time, the user modes are filled, one by one from the export_list. */ if ((out_resp->count_modes >= mode_count) && mode_count) { copied = 0; mode_ptr = (struct drm_mode_modeinfo __user *)(unsigned long)out_resp->modes_ptr; list_for_each_entry(mode, &export_list, export_head) { drm_mode_convert_to_umode(&u_mode, mode); /* * Reset aspect ratio flags of user-mode, if modes with * aspect-ratio are not supported. */ if (!file_priv->aspect_ratio_allowed) u_mode.flags &= ~DRM_MODE_FLAG_PIC_AR_MASK; if (copy_to_user(mode_ptr + copied, &u_mode, sizeof(u_mode))) { ret = -EFAULT; mutex_unlock(&dev->mode_config.mutex); goto out; } copied++; } } out_resp->count_modes = mode_count; mutex_unlock(&dev->mode_config.mutex); drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); encoder = drm_connector_get_encoder(connector); if (encoder) out_resp->encoder_id = encoder->base.id; else out_resp->encoder_id = 0; /* Only grab properties after probing, to make sure EDID and other * properties reflect the latest status. */ ret = drm_mode_object_get_properties(&connector->base, file_priv->atomic, (uint32_t __user *)(unsigned long)(out_resp->props_ptr), (uint64_t __user *)(unsigned long)(out_resp->prop_values_ptr), &out_resp->count_props); drm_modeset_unlock(&dev->mode_config.connection_mutex); out: drm_connector_put(connector); return ret; } /** * DOC: Tile group * * Tile groups are used to represent tiled monitors with a unique integer * identifier. Tiled monitors using DisplayID v1.3 have a unique 8-byte handle, * we store this in a tile group, so we have a common identifier for all tiles * in a monitor group. The property is called "TILE". Drivers can manage tile * groups using drm_mode_create_tile_group(), drm_mode_put_tile_group() and * drm_mode_get_tile_group(). But this is only needed for internal panels where * the tile group information is exposed through a non-standard way. */ static void drm_tile_group_free(struct kref *kref) { struct drm_tile_group *tg = container_of(kref, struct drm_tile_group, refcount); struct drm_device *dev = tg->dev; mutex_lock(&dev->mode_config.idr_mutex); idr_remove(&dev->mode_config.tile_idr, tg->id); mutex_unlock(&dev->mode_config.idr_mutex); kfree(tg); } /** * drm_mode_put_tile_group - drop a reference to a tile group. * @dev: DRM device * @tg: tile group to drop reference to. * * drop reference to tile group and free if 0. */ void drm_mode_put_tile_group(struct drm_device *dev, struct drm_tile_group *tg) { kref_put(&tg->refcount, drm_tile_group_free); } EXPORT_SYMBOL(drm_mode_put_tile_group); /** * drm_mode_get_tile_group - get a reference to an existing tile group * @dev: DRM device * @topology: 8-bytes unique per monitor. * * Use the unique bytes to get a reference to an existing tile group. * * RETURNS: * tile group or NULL if not found. */ struct drm_tile_group *drm_mode_get_tile_group(struct drm_device *dev, char topology[8]) { struct drm_tile_group *tg; int id; mutex_lock(&dev->mode_config.idr_mutex); idr_for_each_entry(&dev->mode_config.tile_idr, tg, id) { if (!memcmp(tg->group_data, topology, 8)) { if (!kref_get_unless_zero(&tg->refcount)) tg = NULL; mutex_unlock(&dev->mode_config.idr_mutex); return tg; } } mutex_unlock(&dev->mode_config.idr_mutex); return NULL; } EXPORT_SYMBOL(drm_mode_get_tile_group); /** * drm_mode_create_tile_group - create a tile group from a displayid description * @dev: DRM device * @topology: 8-bytes unique per monitor. * * Create a tile group for the unique monitor, and get a unique * identifier for the tile group. * * RETURNS: * new tile group or error. */ struct drm_tile_group *drm_mode_create_tile_group(struct drm_device *dev, char topology[8]) { struct drm_tile_group *tg; int ret; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) return ERR_PTR(-ENOMEM); kref_init(&tg->refcount); memcpy(tg->group_data, topology, 8); tg->dev = dev; mutex_lock(&dev->mode_config.idr_mutex); ret = idr_alloc(&dev->mode_config.tile_idr, tg, 1, 0, GFP_KERNEL); if (ret >= 0) { tg->id = ret; } else { kfree(tg); tg = ERR_PTR(ret); } mutex_unlock(&dev->mode_config.idr_mutex); return tg; } EXPORT_SYMBOL(drm_mode_create_tile_group); |
237 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <linux/rbtree.h> #include <uapi/linux/bpf.h> struct sock; struct sockaddr; struct cgroup; struct sk_buff; struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key; #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) DECLARE_PER_CPU(void*, bpf_cgroup_storage); struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; char data[0]; }; struct bpf_cgroup_storage { struct bpf_storage_buffer *buf; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list; struct rb_node node; struct rcu_head rcu; }; struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; struct bpf_cgroup_storage *storage; }; struct bpf_prog_array; struct cgroup_bpf { /* array of effective progs in this cgroup */ struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; /* attached progs to this cgroup and attach flags * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will * have either zero or one element * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS */ struct list_head progs[MAX_BPF_ATTACH_TYPE]; u32 flags[MAX_BPF_ATTACH_TYPE]; /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array __rcu *inactive; }; void cgroup_bpf_put(struct cgroup *cgrp); int cgroup_bpf_inherit(struct cgroup *cgrp); int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type); int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, void *t_ctx); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) { struct bpf_storage_buffer *buf; if (!storage) return; buf = READ_ONCE(storage->buf); this_cpu_write(bpf_cgroup_storage, &buf->data[0]); } struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ BPF_CGROUP_INET_EGRESS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ t_ctx); \ release_sock(sk); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ sk->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ \ __ret; \ }) int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else struct bpf_prog; struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EINVAL; } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map) { return 0; } static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog) { return 0; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ #endif /* _BPF_CGROUP_H */ |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoenet.c * Ethernet portion of AoE driver */ #include <linux/gfp.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> #include <asm/unaligned.h> #include "aoe.h" #define NECODES 5 static char *aoe_errlist[] = { "no such error", "unrecognized command code", "bad argument parameter", "device unavailable", "config string present", "unsupported version" }; enum { IFLISTSZ = 1024, }; static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); static wait_queue_head_t txwq; static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { strncpy(aoe_iflist, str, IFLISTSZ); aoe_iflist[IFLISTSZ - 1] = '\0'; return 1; } __setup("aoe_iflist=", aoe_iflist_setup); #endif static spinlock_t txlock; static struct sk_buff_head skbtxq; /* enters with txlock held */ static int tx(int id) __must_hold(&txlock) { struct sk_buff *skb; struct net_device *ifp; while ((skb = skb_dequeue(&skbtxq))) { spin_unlock_irq(&txlock); ifp = skb->dev; if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); spin_lock_irq(&txlock); } return 0; } int is_aoe_netif(struct net_device *ifp) { register char *p, *q; register int len; if (aoe_iflist[0] == '\0') return 1; p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); for (; *p; p = q + strspn(q, WHITESPACE)) { q = p + strcspn(p, WHITESPACE); if (q != p) len = q - p; else len = strlen(p); /* last token in aoe_iflist */ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) return 1; if (q == p) break; } return 0; } int set_aoe_iflist(const char __user *user_str, size_t size) { if (size >= IFLISTSZ) return -EINVAL; if (copy_from_user(aoe_iflist, user_str, size)) { printk(KERN_INFO "aoe: copy from user failed\n"); return -EFAULT; } aoe_iflist[size] = 0x00; return 0; } void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); spin_lock_irqsave(&txlock, flags); skb_queue_tail(&skbtxq, skb); spin_unlock_irqrestore(&txlock, flags); wake_up(&txwq); } } /* * (1) len doesn't include the header by default. I want this. */ static int aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; struct aoe_atahdr *ah; u32 n; int sn; if (dev_net(ifp) != &init_net) goto exit; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; if (!is_aoe_netif(ifp)) goto exit; skb_push(skb, ETH_HLEN); /* (1) */ sn = sizeof(*h) + sizeof(*ah); if (skb->len >= sn) { sn -= skb_headlen(skb); if (sn > 0 && !__pskb_pull_tail(skb, sn)) goto exit; } h = (struct aoe_hdr *) skb->data; n = get_unaligned_be32(&h->tag); if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) goto exit; if (h->verfl & AOEFL_ERR) { n = h->err; if (n > NECODES) n = 0; if (net_ratelimit()) printk(KERN_ERR "%s%d.%d@%s; ecode=%d '%s'\n", "aoe: error packet from ", get_unaligned_be16(&h->major), h->minor, skb->dev->name, h->err, aoe_errlist[n]); goto exit; } switch (h->cmd) { case AOECMD_ATA: /* ata_rsp may keep skb for later processing or give it back */ skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); break; default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); break; } if (!skb) return 0; exit: dev_kfree_skb(skb); return 0; } static struct packet_type aoe_pt __read_mostly = { .type = __constant_htons(ETH_P_AOE), .func = aoenet_rcv, }; int __init aoenet_init(void) { skb_queue_head_init(&skbtxq); init_waitqueue_head(&txwq); spin_lock_init(&txlock); kts.lock = &txlock; kts.fn = tx; kts.waitq = &txwq; kts.id = 0; snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); if (aoe_ktstart(&kts)) return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } void aoenet_exit(void) { aoe_ktstop(&kts); skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); } |
7 7 1172 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | /* * Cryptographic API. * * SHA1 Secure Hash Algorithm. * * Derived from cryptoapi implementation, adapted for in-place * scatterlist interface. * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> #include <asm/byteorder.h> const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 }; EXPORT_SYMBOL_GPL(sha1_zero_message_hash); static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, int blocks) { u32 temp[SHA_WORKSPACE_WORDS]; while (blocks--) { sha_transform(sst->state, src, temp); src += SHA1_BLOCK_SIZE; } memzero_explicit(temp, sizeof(temp)); } int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha1_base_do_update(desc, data, len, sha1_generic_block_fn); } EXPORT_SYMBOL(crypto_sha1_update); static int sha1_final(struct shash_desc *desc, u8 *out) { sha1_base_do_finalize(desc, sha1_generic_block_fn); return sha1_base_finish(desc, out); } int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha1_base_do_update(desc, data, len, sha1_generic_block_fn); return sha1_final(desc, out); } EXPORT_SYMBOL(crypto_sha1_finup); static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = crypto_sha1_update, .final = sha1_final, .finup = crypto_sha1_finup, .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-generic", .cra_priority = 100, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } }; static int __init sha1_generic_mod_init(void) { return crypto_register_shash(&alg); } static void __exit sha1_generic_mod_fini(void) { crypto_unregister_shash(&alg); } module_init(sha1_generic_mod_init); module_exit(sha1_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-generic"); |
59 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. */ #ifndef BTRFS_QGROUP_H #define BTRFS_QGROUP_H #include "ulist.h" #include "delayed-ref.h" /* * Btrfs qgroup overview * * Btrfs qgroup splits into 3 main part: * 1) Reserve * Reserve metadata/data space for incoming operations * Affect how qgroup limit works * * 2) Trace * Tell btrfs qgroup to trace dirty extents. * * Dirty extents including: * - Newly allocated extents * - Extents going to be deleted (in this trans) * - Extents whose owner is going to be modified * * This is the main part affects whether qgroup numbers will stay * consistent. * Btrfs qgroup can trace clean extents and won't cause any problem, * but it will consume extra CPU time, it should be avoided if possible. * * 3) Account * Btrfs qgroup will updates its numbers, based on dirty extents traced * in previous step. * * Normally at qgroup rescan and transaction commit time. */ /* * Record a dirty extent, and info qgroup to update quota on it * TODO: Use kmem cache to alloc it. */ struct btrfs_qgroup_extent_record { struct rb_node node; u64 bytenr; u64 num_bytes; struct ulist *old_roots; }; /* * Qgroup reservation types: * * DATA: * space reserved for data * * META_PERTRANS: * Space reserved for metadata (per-transaction) * Due to the fact that qgroup data is only updated at transaction commit * time, reserved space for metadata must be kept until transaction * commits. * Any metadata reserved that are used in btrfs_start_transaction() should * be of this type. * * META_PREALLOC: * There are cases where metadata space is reserved before starting * transaction, and then btrfs_join_transaction() to get a trans handle. * Any metadata reserved for such usage should be of this type. * And after join_transaction() part (or all) of such reservation should * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { BTRFS_QGROUP_RSV_DATA = 0, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, }; /* * Represents how many bytes we have reserved for this qgroup. * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while * *currently* meta is just reserve-and-clear during transcation. * * TODO: Add new type for reservation which can survive transaction commit. * Currect metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; }; /* * one struct for each qgroup, organized in fs_info->qgroup_tree. */ struct btrfs_qgroup { u64 qgroupid; /* * state */ u64 rfer; /* referenced */ u64 rfer_cmpr; /* referenced compressed */ u64 excl; /* exclusive */ u64 excl_cmpr; /* exclusive compressed */ /* * limits */ u64 lim_flags; /* which limits are set */ u64 max_rfer; u64 max_excl; u64 rsv_rfer; u64 rsv_excl; /* * reservation tracking */ struct btrfs_qgroup_rsv rsv; /* * lists */ struct list_head groups; /* groups this group is member of */ struct list_head members; /* groups that are members of this group */ struct list_head dirty; /* dirty groups */ struct rb_node node; /* tree of qgroups */ /* * temp variables for accounting operations * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; }; /* * For qgroup event trace points only */ #define QGROUP_RESERVE (1<<0) #define QGROUP_RELEASE (1<<1) #define QGROUP_FREE (1<<2) int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit); int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. * * No lock version, caller must acquire delayed ref lock and allocated memory, * then call btrfs_qgroup_trace_extent_post() after exiting lock context. * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. * Error is not possible */ int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record); /* * Post handler after qgroup_trace_extent_nolock(). * * NOTE: Current qgroup does the expensive backref walk at transaction * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming * new transaction. * This is designed to allow btrfs_find_all_roots() to get correct new_roots * result. * * However for old_roots there is no need to do backref walk at that time, * since we search commit roots to walk backref and result will always be * correct. * * Due to the nature of no lock version, we can't do backref there. * So we must call btrfs_qgroup_trace_extent_post() after exiting * spinlock context. * * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result * using current root, then we can move all expensive backref walk out of * transaction committing, but not now as qgroup accounting will be wrong again. */ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *qrecord); /* * Inform qgroup to trace one dirty extent, specified by @bytenr and * @num_bytes. * So qgroup can account it at commit trans time. * * Better encapsulated version, with memory allocation and backref walk for * commit roots. * So this can sleep. * * Return 0 if the operation is done. * Return <0 for error, like memory allocation failure or invalid parameter * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, gfp_t gfp_flag); /* * Inform qgroup to trace all leaf items of data * * Return 0 for success * Return <0 for error(ENOMEM) */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); /* * Inform qgroup to trace a whole subtree, including all its child tree * blocks and data. * The root tree block is specified by @root_eb. * * Normally used by relocation(tree block swap) and subvolume deletion. * * Return 0 for success * Return <0 for error(ENOMEM or tree search error) */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return; trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, BTRFS_QGROUP_RSV_DATA); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); /* Reserve metadata space for pertrans and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS, enforce); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC, enforce); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type); /* Free per-transaction meta reservation for error handling */ static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, int num_bytes) { __btrfs_qgroup_free_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } /* Pre-allocated meta reservation can be freed at need */ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes) { __btrfs_qgroup_free_meta(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); } /* * Per-transaction meta reservation should be all freed at transaction commit * time */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); /* * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. * * This is called when preallocated meta reservation needs to be used. * Normally after btrfs_join_transaction() call. */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif |
22 24 24 170 5 29 12 17 29 40 457 436 21 21 21 21 474 26 26 20 20 437 455 719 5 2 2 2 719 26 474 257 68 62 257 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 | // SPDX-License-Identifier: GPL-2.0 #include <linux/crypto.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/tcp.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <net/inetpeer.h> #include <net/tcp.h> void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; struct tcp_fastopen_context *ctxt; rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) { rcu_read_unlock(); return; } rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. * * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); crypto_free_cipher(ctx->tfm); kfree(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference_protected( inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); if (ctx) call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); } void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *key, unsigned int len) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; int err; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(ctx->tfm)) { err = PTR_ERR(ctx->tfm); error: kfree(ctx); pr_err("TCP: TFO aes cipher alloc error: %d\n", err); return err; } err = crypto_cipher_setkey(ctx->tfm, key, len); if (err) { pr_err("TCP: TFO cipher key error: %d\n", err); crypto_free_cipher(ctx->tfm); goto error; } memcpy(ctx->key, key, len); spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; octx = rcu_dereference_protected(q->ctx, lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); rcu_assign_pointer(q->ctx, ctx); } else { octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); } spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; ok = true; } rcu_read_unlock(); return ok; } /* Generate the fastopen cookie by doing aes128 encryption on both * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6 * addresses. For the longer IPv6 addresses use CBC-MAC. * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ static bool tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; return __tcp_fastopen_cookie_gen(sk, path, foc); } #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; return __tcp_fastopen_cookie_gen(sk, buf, foc); } } #endif return false; } /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. */ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb_dst_drop(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects * skb->len to include the tcp_hdrlen. Hence, it should * be called before __skb_pull(). */ tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->sk_receive_queue, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, * as we certainly are not changing upper 32bit value (0) */ tp->bytes_received = skb->len; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; bool own_req; req->num_retrans = 0; req->num_timeout = 0; req->sk = NULL; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) return NULL; spin_lock(&queue->fastopenq.lock); queue->fastopenq.qlen++; spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); tp->fastopen_rsk = req; tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ return child; } static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying * to validating the cookie in order to avoid burning CPU cycles * unnecessarily. * * XXX (TFO) - The implication of checking the max_qlen before * processing a cookie request is that clients can't differentiate * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; if (fastopenq->max_qlen == 0) return false; if (fastopenq->qlen >= fastopenq->max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); return false; } fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); reqsk_put(req1); } return true; } static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { /* Cookie is valid. Create a (full) child socket to accept * the data in SYN before returning a SYN-ACK to ack the * data. If we fail to create the socket, fall back and * ack the ISN only but includes the same cookie. * * Note: Data-less SYN with valid cookie is allowed to send * data in SYN_RECV state. */ fastopen: child = tcp_fastopen_create_child(sk, skb, req); if (child) { foc->len = -1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); return child; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; } bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { const struct dst_entry *dst; tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { cookie->len = -1; return false; } dst = __sk_dst_get(sk); if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } return cookie->len > 0; } /* This function checks if we want to defer sending SYN until the first * write(). We defer under the following conditions: * 1. fastopen_connect sockopt is set * 2. we have a valid cookie * Return value: return true if we want to defer until application writes data * return false if we want to send out SYN immediately */ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) { struct tcp_fastopen_cookie cookie = { .len = 0 }; struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { inet_sk(sk)->defer_connect = 1; return true; } /* Alloc fastopen_req in order for FO option to be included * in SYN */ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), sk->sk_allocation); if (tp->fastopen_req) tp->fastopen_req->cookie = cookie; else *err = -ENOBUFS; } return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: * Middlebox firewall issues can potentially cause server's data being * blackholed after a successful 3WHS using TFO. * The proposed solution is to disable active TFO globally under the * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST * 3. client side TFO socket has timed out three times consecutively during * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active * TFO connection with data exchanges. */ /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies); /* Paired with smp_rmb() in tcp_fastopen_active_should_disable(). * We want net->ipv4.tfo_active_disable_stamp to be updated first. */ smp_mb__before_atomic(); atomic_inc(&net->ipv4.tfo_active_disable_times); NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable * Return true if we are still in the active TFO disable period * Return false if timeout already expired and we should use active TFO */ bool tcp_fastopen_active_should_disable(struct sock *sk) { unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; int multiplier; if (!tfo_da_times) return false; /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */ smp_rmb(); /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) + multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, timeout)) return true; /* Mark check bit so we can check for successful active TFO * condition and reset tfo_active_disable_times */ tcp_sk(sk)->syn_fastopen_ch = 1; return false; } /* Disable active TFO if FIN is the only packet in the ofo queue * and no data is received. * Also check if we can reset tfo_active_disable_times if data is * received successfully on a marked active TFO sockets opened on * a non-loopback interface */ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst; struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { skb = skb_rb_first(&tp->out_of_order_queue); if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) { u32 timeouts = inet_csk(sk)->icsk_retransmits; struct tcp_sock *tp = tcp_sk(sk); /* Broken middle-boxes may black-hole Fast Open connection during or * even after the handshake. Be extremely conservative and pause * Fast Open globally after hitting the third consecutive timeout or * exceeding the configured timeout limit. */ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && (timeouts == 2 || (timeouts < 2 && expired))) { tcp_fastopen_active_disable(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } } |
13039 20 95 22122 1120 4655 9034 8164 17898 9257 508 6625 3162 1729 434 4468 2016 2077 118 125 36 321 125 8828 13164 9612 20 64 3583 1998 106 8808 15279 963 3032 306 153 1730 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H #include <linux/types.h> #include <linux/stddef.h> #include <linux/poison.h> #include <linux/const.h> #include <linux/kernel.h> /* * Simple doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); list->prev = list; } #ifdef CONFIG_DEBUG_LIST extern bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next); extern bool __list_del_entry_valid(struct list_head *entry); #else static inline bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { return true; } static inline bool __list_del_entry_valid(struct list_head *entry) { return true; } #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; next->prev = new; new->next = next; new->prev = prev; WRITE_ONCE(prev->next, new); } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; WRITE_ONCE(prev->next, next); } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. */ static inline void __list_del_entry(struct list_head *entry) { if (!__list_del_entry_valid(entry)) return; __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { __list_del_entry(entry); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. */ static inline int list_empty(const struct list_head *head) { return READ_ONCE(head->next) == head; } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = head->next; return (next == head) && (next == head->prev); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. * */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && (head->next != entry && head != entry)) return; if (entry == head) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } /** * list_cut_before - cut a list into two, before given entry * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * * This helper moves the initial part of @head, up to but * excluding @entry, from @head to @list. You should pass * in @entry an element you know is on @head. @list should * be an empty list or a list you do not care about losing * its data. * If @entry == @head, all entries on @head are moved to * @list. */ static inline void list_cut_before(struct list_head *list, struct list_head *head, struct list_head *entry) { if (head->next == entry) { INIT_LIST_HEAD(list); return; } list->next = head->next; list->next->prev = list; list->prev = entry->prev; list->prev->next = list; head->next = entry; entry->prev = head; } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) ({ \ struct list_head *head__ = (ptr); \ struct list_head *pos__ = READ_ONCE(head__->next); \ pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) /** * list_next_entry - get the next element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_prev_entry - get the prev element in list * @pos: the type * to cursor * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ list_entry((pos)->member.prev, typeof(*(pos)), member) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ pos != (head); \ pos = n, n = pos->prev) /** * list_entry_is_head - test if the entry points to the head of the list * @pos: the type * to cursor * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_entry_is_head(pos, head, member) \ (&pos->member == (head)) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_next_entry(pos, member)) /** * list_for_each_entry_from_reverse - iterate backwards over list of given type * from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, continuing from current position. */ #define list_for_each_entry_from_reverse(pos, head, member) \ for (; !list_entry_is_head(pos, head, member); \ pos = list_prev_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_first_entry(head, typeof(*pos), member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_next_entry(pos, member), \ n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_next_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_last_entry(head, typeof(*pos), member), \ n = list_prev_entry(pos, member); \ !list_entry_is_head(pos, head, member); \ pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } static inline int hlist_empty(const struct hlist_head *h) { return !READ_ONCE(h->first); } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (next) next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; if (first) first->pprev = &n->next; WRITE_ONCE(h->first, n); n->pprev = &h->first; } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; next->pprev = &n->next; WRITE_ONCE(*(n->pprev), n); } static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(prev->next, n); n->pprev = &prev->next; if (n->next) n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ static inline void hlist_add_fake(struct hlist_node *n) { n->pprev = &n->next; } static inline bool hlist_fake(struct hlist_node *h) { return h->pprev == &h->next; } /* * Check whether the node is the only node of the head without * accessing head: */ static inline bool hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) { return !n->next && n->pprev == &h->first; } /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists. */ static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) { new->first = old->first; if (new->first) new->first->pprev = &new->first; old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) #define hlist_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ }) /** * hlist_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(pos, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(pos, member) \ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(pos, member) \ for (; pos; \ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(pos, n, head, member) \ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ pos && ({ n = pos->member.next; 1; }); \ pos = hlist_entry_safe(n, typeof(*pos), member)) #endif |
83 80 82 80 375 375 374 374 10 91 70 70 70 69 69 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | /* * Support KVM gust page tracking * * This feature allows us to track page access in guest. Currently, only * write access is tracked. * * Copyright(C) 2015 Intel Corporation. * * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. */ #include <linux/kvm_host.h> #include <linux/rculist.h> #include <asm/kvm_host.h> #include <asm/kvm_page_track.h> #include "mmu.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) if (!dont || free->arch.gfn_track[i] != dont->arch.gfn_track[i]) { kvfree(free->arch.gfn_track[i]); free->arch.gfn_track[i] = NULL; } } int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } return 0; track_free: kvm_page_track_free_memslot(slot, NULL); return -ENOMEM; } static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) { if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) return false; return true; } static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { int index, val; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); val = slot->arch.gfn_track[mode][index]; if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) return; slot->arch.gfn_track[mode][index] += count; } /* * add guest page to the tracking pool so that corresponding access on that * page will be intercepted. * * It should be called under the protection both of mmu-lock and kvm->srcu * or kvm->slots_lock. * * @kvm: the guest instance we are interested in. * @slot: the @gfn belongs to. * @gfn: the guest page. * @mode: tracking mode, currently only write track is supported. */ void kvm_slot_page_track_add_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode) { if (WARN_ON(!page_track_mode_is_valid(mode))) return; update_gfn_track(slot, gfn, mode, 1); /* * new track stops large page mapping for the * tracked page. */ kvm_mmu_gfn_disallow_lpage(slot, gfn); if (mode == KVM_PAGE_TRACK_WRITE) if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception * of corresponding access on that page. It is the opposed operation of * kvm_slot_page_track_add_page(). * * It should be called under the protection both of mmu-lock and kvm->srcu * or kvm->slots_lock. * * @kvm: the guest instance we are interested in. * @slot: the @gfn belongs to. * @gfn: the guest page. * @mode: tracking mode, currently only write track is supported. */ void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode) { if (WARN_ON(!page_track_mode_is_valid(mode))) return; update_gfn_track(slot, gfn, mode, -1); /* * allow large page mapping for the tracked page * after the tracker is gone. */ kvm_mmu_gfn_allow_lpage(slot, gfn); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } void kvm_page_track_cleanup(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; cleanup_srcu_struct(&head->track_srcu); } void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; init_srcu_struct(&head->track_srcu); INIT_HLIST_HEAD(&head->track_notifier_list); } /* * register the notifier so that event interception for the tracked guest * pages can be received. */ void kvm_page_track_register_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; spin_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of * kvm_page_track_register_notifier(). */ void kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; spin_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is * finished at this time. * * The node should figure out if the written page is the one that node is * interested in by itself. */ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; head = &vcpu->kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) n->track_write(vcpu, gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); } /* * Notify the node that memory slot is being removed or moved so that it can * drop write-protection for the pages in the memory slot. * * The node should figure out it has any write-protected pages in this slot * by itself. */ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; head = &kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_flush_slot) n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } |
1 34 29 2 2 2 2 2 2 2 1 2 2 2 2 2 1 2 48 48 5 36 36 15 2 2 2 2 2 2 1 1 1 3 1 1 8 8 8 8 11 11 11 11 1 11 32 1 32 36 4 32 1 31 31 31 31 31 3 5 36 36 38 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | /* * linux/fs/hfs/super.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains hfs_read_super(), some of the super_ops and * init_hfs_fs() and exit_hfs_fs(). The remaining super_ops are in * inode.c since they deal with inodes. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> #include <linux/parser.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> #include "hfs_fs.h" #include "btree.h" static struct kmem_cache *hfs_inode_cachep; MODULE_LICENSE("GPL"); static int hfs_sync_fs(struct super_block *sb, int wait) { hfs_mdb_commit(sb); return 0; } /* * hfs_put_super() * * This is the put_super() entry in the super_operations structure for * HFS filesystems. The purpose is to release the resources * associated with the superblock sb. */ static void hfs_put_super(struct super_block *sb) { cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); } static void flush_mdb(struct work_struct *work) { struct hfs_sb_info *sbi; struct super_block *sb; sbi = container_of(work, struct hfs_sb_info, mdb_work.work); sb = sbi->sb; spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); hfs_mdb_commit(sb); } void hfs_mark_mdb_dirty(struct super_block *sb) { struct hfs_sb_info *sbi = HFS_SB(sb); unsigned long delay; if (sb_rdonly(sb)) return; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->mdb_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } /* * hfs_statfs() * * This is the statfs() entry in the super_operations structure for * HFS filesystems. The purpose is to return various data about the * filesystem. * * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. */ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div; buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = HFS_NAMELEN; return 0; } static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= SB_NODIRATIME; if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; if (!(*flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; *flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; *flags |= SB_RDONLY; } } return 0; } static int hfs_show_options(struct seq_file *seq, struct dentry *root) { struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); if (sbi->s_dir_umask != 0022) seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls_disk) seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset); if (sbi->nls_io) seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset); if (sbi->s_quiet) seq_printf(seq, ",quiet"); return 0; } static struct inode *hfs_alloc_inode(struct super_block *sb) { struct hfs_inode_info *i; i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL); return i ? &i->vfs_inode : NULL; } static void hfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(hfs_inode_cachep, HFS_I(inode)); } static void hfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, hfs_i_callback); } static const struct super_operations hfs_super_operations = { .alloc_inode = hfs_alloc_inode, .destroy_inode = hfs_destroy_inode, .write_inode = hfs_write_inode, .evict_inode = hfs_evict_inode, .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, .remount_fs = hfs_remount, .show_options = hfs_show_options, }; enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, opt_err }; static const match_table_t tokens = { { opt_uid, "uid=%u" }, { opt_gid, "gid=%u" }, { opt_umask, "umask=%o" }, { opt_file_umask, "file_umask=%o" }, { opt_dir_umask, "dir_umask=%o" }, { opt_part, "part=%u" }, { opt_session, "session=%u" }, { opt_type, "type=%s" }, { opt_creator, "creator=%s" }, { opt_quiet, "quiet" }, { opt_codepage, "codepage=%s" }, { opt_iocharset, "iocharset=%s" }, { opt_err, NULL } }; static inline int match_fourchar(substring_t *arg, u32 *result) { if (arg->to - arg->from != 4) return -EINVAL; memcpy(result, arg->from, 4); return 0; } /* * parse_options() * * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger * This function is called by hfs_read_super() to parse the mount options. */ static int parse_options(char *options, struct hfs_sb_info *hsb) { char *p; substring_t args[MAX_OPT_ARGS]; int tmp, token; /* initialize the sb with defaults */ hsb->s_uid = current_uid(); hsb->s_gid = current_gid(); hsb->s_file_umask = 0133; hsb->s_dir_umask = 0022; hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ hsb->s_quiet = 0; hsb->part = -1; hsb->session = -1; if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case opt_uid: if (match_int(&args[0], &tmp)) { pr_err("uid requires an argument\n"); return 0; } hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); if (!uid_valid(hsb->s_uid)) { pr_err("invalid uid %d\n", tmp); return 0; } break; case opt_gid: if (match_int(&args[0], &tmp)) { pr_err("gid requires an argument\n"); return 0; } hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); if (!gid_valid(hsb->s_gid)) { pr_err("invalid gid %d\n", tmp); return 0; } break; case opt_umask: if (match_octal(&args[0], &tmp)) { pr_err("umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; hsb->s_dir_umask = (umode_t)tmp; break; case opt_file_umask: if (match_octal(&args[0], &tmp)) { pr_err("file_umask requires a value\n"); return 0; } hsb->s_file_umask = (umode_t)tmp; break; case opt_dir_umask: if (match_octal(&args[0], &tmp)) { pr_err("dir_umask requires a value\n"); return 0; } hsb->s_dir_umask = (umode_t)tmp; break; case opt_part: if (match_int(&args[0], &hsb->part)) { pr_err("part requires an argument\n"); return 0; } break; case opt_session: if (match_int(&args[0], &hsb->session)) { pr_err("session requires an argument\n"); return 0; } break; case opt_type: if (match_fourchar(&args[0], &hsb->s_type)) { pr_err("type requires a 4 character value\n"); return 0; } break; case opt_creator: if (match_fourchar(&args[0], &hsb->s_creator)) { pr_err("creator requires a 4 character value\n"); return 0; } break; case opt_quiet: hsb->s_quiet = 1; break; case opt_codepage: if (hsb->nls_disk) { pr_err("unable to change codepage\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { pr_err("unable to load codepage \"%s\"\n", p); kfree(p); return 0; } kfree(p); break; case opt_iocharset: if (hsb->nls_io) { pr_err("unable to change iocharset\n"); return 0; } p = match_strdup(&args[0]); if (p) hsb->nls_io = load_nls(p); if (!hsb->nls_io) { pr_err("unable to load iocharset \"%s\"\n", p); kfree(p); return 0; } kfree(p); break; default: return 0; } } if (hsb->nls_disk && !hsb->nls_io) { hsb->nls_io = load_nls_default(); if (!hsb->nls_io) { pr_err("unable to load default iocharset\n"); return 0; } } hsb->s_dir_umask &= 0777; hsb->s_file_umask &= 0577; return 1; } /* * hfs_read_super() * * This is the function that is responsible for mounting an HFS * filesystem. It performs all the tasks necessary to get enough data * from the disk to read the root inode. This includes parsing the * mount options, dealing with Macintosh partitions, reading the * superblock and the allocation bitmap blocks, calling * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. */ static int hfs_fill_super(struct super_block *sb, void *data, int silent) { struct hfs_sb_info *sbi; struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; int res; sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->sb = sb; sb->s_fs_info = sbi; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); res = -EINVAL; if (!parse_options((char *)data, sbi)) { pr_err("unable to parse mount options\n"); goto bail; } sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); if (res) { if (!silent) pr_warn("can't find a HFS filesystem on dev %s\n", hfs_mdb_name(sb)); res = -EINVAL; goto bail; } /* try to get the root inode */ res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) goto bail_no_root; res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd); if (!res) { if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) { res = -EIO; goto bail_hfs_find; } hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength); } if (res) goto bail_hfs_find; res = -EINVAL; root_inode = hfs_iget(sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); if (!root_inode) goto bail_no_root; sb->s_d_op = &hfs_dentry_operations; res = -ENOMEM; sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto bail_no_root; /* everything's okay */ return 0; bail_hfs_find: hfs_find_exit(&fd); bail_no_root: pr_err("get root inode failed\n"); bail: hfs_mdb_put(sb); return res; } static struct dentry *hfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("hfs"); static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; inode_init_once(&i->vfs_inode); } static int __init init_hfs_fs(void) { int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); if (err) kmem_cache_destroy(hfs_inode_cachep); return err; } static void __exit exit_hfs_fs(void) { unregister_filesystem(&hfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(hfs_inode_cachep); } module_init(init_hfs_fs) module_exit(exit_hfs_fs) |
5 5 5 5 5 5 4 4 4 4 4 5 3 3 2 2 2 2 4 6 3 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 | /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. * * This software is licensed under the terms of the GNU General Public * License version 2, as published by the Free Software Foundation, and * may be copied, distributed, and modified under those terms. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ /* * Locking overview * * There are 3 main spinlocks which must be acquired in the * order shown: * * 1) proc->outer_lock : protects binder_ref * binder_proc_lock() and binder_proc_unlock() are * used to acq/rel. * 2) node->lock : protects most fields of binder_node. * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->waiting_threads, proc->nodes) * and all todo lists associated with the binder_proc * (proc->todo, thread->todo, proc->delivered_death and * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * * Any lock under procA must never be nested under any lock at the same * level or below on procB. * * Functions that require a lock held on entry indicate which lock * in the suffix of the function name: * * foo_olocked() : requires node->outer_lock * foo_nlocked() : requires node->lock * foo_ilocked() : requires proc->inner_lock * foo_oilocked(): requires proc->outer_lock and proc->inner_lock * foo_nilocked(): requires node->lock and proc->inner_lock * ... */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fdtable.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/poll.h> #include <linux/debugfs.h> #include <linux/rbtree.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/pid_namespace.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/ratelimit.h> #include <uapi/linux/android/binder.h> #include <asm/cacheflush.h> #include "binder_alloc.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static DEFINE_MUTEX(binder_procs_lock); static HLIST_HEAD(binder_dead_nodes); static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; #define BINDER_DEBUG_ENTRY(name) \ static int binder_##name##_open(struct inode *inode, struct file *file) \ { \ return single_open(file, binder_##name##_show, inode->i_private); \ } \ \ static const struct file_operations binder_##name##_fops = { \ .owner = THIS_MODULE, \ .open = binder_##name##_open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = single_release, \ } static int binder_proc_show(struct seq_file *m, void *unused); BINDER_DEBUG_ENTRY(proc); /* This is only defined in include/asm-arm/sizes.h */ #ifndef SZ_1K #define SZ_1K 0x400 #endif #ifndef SZ_4M #define SZ_4M 0x400000 #endif #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, BINDER_DEBUG_OPEN_CLOSE = 1U << 3, BINDER_DEBUG_DEAD_BINDER = 1U << 4, BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, BINDER_DEBUG_READ_WRITE = 1U << 6, BINDER_DEBUG_USER_REFS = 1U << 7, BINDER_DEBUG_THREADS = 1U << 8, BINDER_DEBUG_TRANSACTION = 1U << 9, BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, 0444); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; static int binder_set_stop_on_user_error(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (binder_stop_on_user_error < 2) wake_up(&binder_user_error_wait); return ret; } module_param_call(stop_on_user_error, binder_set_stop_on_user_error, param_get_int, &binder_stop_on_user_error, 0644); #define binder_debug(mask, x...) \ do { \ if (binder_debug_mask & mask) \ pr_info_ratelimited(x); \ } while (0) #define binder_user_error(x...) \ do { \ if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \ pr_info_ratelimited(x); \ if (binder_stop_on_user_error) \ binder_stop_on_user_error = 2; \ } while (0) #define to_flat_binder_object(hdr) \ container_of(hdr, struct flat_binder_object, hdr) #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) enum binder_stat_types { BINDER_STAT_PROC, BINDER_STAT_THREAD, BINDER_STAT_NODE, BINDER_STAT_REF, BINDER_STAT_DEATH, BINDER_STAT_TRANSACTION, BINDER_STAT_TRANSACTION_COMPLETE, BINDER_STAT_COUNT }; struct binder_stats { atomic_t br[_IOC_NR(BR_FAILED_REPLY) + 1]; atomic_t bc[_IOC_NR(BC_REPLY_SG) + 1]; atomic_t obj_created[BINDER_STAT_COUNT]; atomic_t obj_deleted[BINDER_STAT_COUNT]; }; static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { int debug_id; int debug_id_done; int call_type; int from_proc; int from_thread; int target_handle; int to_proc; int to_thread; int to_node; int data_size; int offsets_size; int return_error_line; uint32_t return_error; uint32_t return_error_param; const char *context_name; }; struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; unsigned int cur = atomic_inc_return(&log->cur); if (cur >= ARRAY_SIZE(log->entry)) log->full = true; e = &log->entry[cur % ARRAY_SIZE(log->entry)]; WRITE_ONCE(e->debug_id_done, 0); /* * write-barrier to synchronize access to e->debug_id_done. * We make sure the initialized 0 value is seen before * memset() other fields are zeroed by memset. */ smp_wmb(); memset(e, 0, sizeof(*e)); return e; } struct binder_context { struct binder_node *binder_context_mgr_node; struct mutex context_mgr_node_lock; kuid_t binder_context_mgr_uid; const char *name; }; struct binder_device { struct hlist_node hlist; struct miscdevice miscdev; struct binder_context context; }; /** * struct binder_work - work enqueued on a worklist * @entry: node enqueued on list * @type: type of work to be performed * * There are separate work lists for proc, thread, and node (async). */ struct binder_work { struct list_head entry; enum binder_work_type { BINDER_WORK_TRANSACTION = 1, BINDER_WORK_TRANSACTION_COMPLETE, BINDER_WORK_RETURN_ERROR, BINDER_WORK_NODE, BINDER_WORK_DEAD_BINDER, BINDER_WORK_DEAD_BINDER_AND_CLEAR, BINDER_WORK_CLEAR_DEATH_NOTIFICATION, } type; }; struct binder_error { struct binder_work work; uint32_t cmd; }; /** * struct binder_node - binder node bookkeeping * @debug_id: unique ID for debugging * (invariant after initialized) * @lock: lock for node fields * @work: worklist element for node work * (protected by @proc->inner_lock) * @rb_node: element for proc->nodes tree * (protected by @proc->inner_lock) * @dead_node: element for binder_dead_nodes list * (protected by binder_dead_nodes_lock) * @proc: binder_proc that owns this node * (invariant after initialized) * @refs: list of references on this node * (protected by @lock) * @internal_strong_refs: used to take strong references when * initiating a transaction * (protected by @proc->inner_lock if @proc * and by @lock) * @local_weak_refs: weak user refs from local process * (protected by @proc->inner_lock if @proc * and by @lock) * @local_strong_refs: strong user refs from local process * (protected by @proc->inner_lock if @proc * and by @lock) * @tmp_refs: temporary kernel refs * (protected by @proc->inner_lock while @proc * is valid, and by binder_dead_nodes_lock * if @proc is NULL. During inc/dec and node release * it is also protected by @lock to provide safety * as the node dies and @proc becomes NULL) * @ptr: userspace pointer for node * (invariant, no lock needed) * @cookie: userspace cookie for node * (invariant, no lock needed) * @has_strong_ref: userspace notified of strong ref * (protected by @proc->inner_lock if @proc * and by @lock) * @pending_strong_ref: userspace has acked notification of strong ref * (protected by @proc->inner_lock if @proc * and by @lock) * @has_weak_ref: userspace notified of weak ref * (protected by @proc->inner_lock if @proc * and by @lock) * @pending_weak_ref: userspace has acked notification of weak ref * (protected by @proc->inner_lock if @proc * and by @lock) * @has_async_transaction: async transaction to node in progress * (protected by @lock) * @accept_fds: file descriptor operations supported for node * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) * @async_todo: list of async work items * (protected by @proc->inner_lock) * * Bookkeeping structure for binder nodes. */ struct binder_node { int debug_id; spinlock_t lock; struct binder_work work; union { struct rb_node rb_node; struct hlist_node dead_node; }; struct binder_proc *proc; struct hlist_head refs; int internal_strong_refs; int local_weak_refs; int local_strong_refs; int tmp_refs; binder_uintptr_t ptr; binder_uintptr_t cookie; struct { /* * bitfield elements protected by * proc inner_lock */ u8 has_strong_ref:1; u8 pending_strong_ref:1; u8 has_weak_ref:1; u8 pending_weak_ref:1; }; struct { /* * invariant after initialization */ u8 accept_fds:1; u8 min_priority; }; bool has_async_transaction; struct list_head async_todo; }; struct binder_ref_death { /** * @work: worklist element for death notifications * (protected by inner_lock of the proc that * this ref belongs to) */ struct binder_work work; binder_uintptr_t cookie; }; /** * struct binder_ref_data - binder_ref counts and id * @debug_id: unique ID for the ref * @desc: unique userspace handle for ref * @strong: strong ref count (debugging only if not locked) * @weak: weak ref count (debugging only if not locked) * * Structure to hold ref count and ref id information. Since * the actual ref can only be accessed with a lock, this structure * is used to return information about the ref to callers of * ref inc/dec functions. */ struct binder_ref_data { int debug_id; uint32_t desc; int strong; int weak; }; /** * struct binder_ref - struct to track references on nodes * @data: binder_ref_data containing id, handle, and current refcounts * @rb_node_desc: node for lookup by @data.desc in proc's rb_tree * @rb_node_node: node for lookup by @node in proc's rb_tree * @node_entry: list entry for node->refs list in target node * (protected by @node->lock) * @proc: binder_proc containing ref * @node: binder_node of target node. When cleaning up a * ref for deletion in binder_cleanup_ref, a non-NULL * @node indicates the node must be freed * @death: pointer to death notification (ref_death) if requested * (protected by @node->lock) * * Structure to track references from procA to target node (on procB). This * structure is unsafe to access without holding @proc->outer_lock. */ struct binder_ref { /* Lookups needed: */ /* node + proc => ref (transaction) */ /* desc + proc => ref (transaction, inc/dec ref) */ /* node => refs + procs (proc exit) */ struct binder_ref_data data; struct rb_node rb_node_desc; struct rb_node rb_node_node; struct hlist_node node_entry; struct binder_proc *proc; struct binder_node *node; struct binder_ref_death *death; }; enum binder_deferred_state { BINDER_DEFERRED_PUT_FILES = 0x01, BINDER_DEFERRED_FLUSH = 0x02, BINDER_DEFERRED_RELEASE = 0x04, }; /** * struct binder_proc - binder process bookkeeping * @proc_node: element for binder_procs list * @threads: rbtree of binder_threads in this proc * (protected by @inner_lock) * @nodes: rbtree of binder nodes associated with * this proc ordered by node->ptr * (protected by @inner_lock) * @refs_by_desc: rbtree of refs ordered by ref->desc * (protected by @outer_lock) * @refs_by_node: rbtree of refs ordered by ref->node * (protected by @outer_lock) * @waiting_threads: threads currently waiting for proc work * (protected by @inner_lock) * @pid PID of group_leader of process * (invariant after initialized) * @tsk task_struct for group_leader of process * (invariant after initialized) * @files files_struct for process * (protected by @files_lock) * @files_lock mutex to protect @files * @cred struct cred associated with the `struct file` * in binder_open() * (invariant after initialized) * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform * (protected by binder_deferred_lock) * @is_dead: process is dead and awaiting free * when outstanding transactions are cleaned up * (protected by @inner_lock) * @todo: list of work for this process * (protected by @inner_lock) * @stats: per-process binder statistics * (atomics, no lock needed) * @delivered_death: list of delivered death notification * (protected by @inner_lock) * @max_threads: cap on number of binder threads * (protected by @inner_lock) * @requested_threads: number of binder threads requested but not * yet started. In current implementation, can * only be 0 or 1. * (protected by @inner_lock) * @requested_threads_started: number binder threads started * (protected by @inner_lock) * @tmp_ref: temporary reference to indicate proc is in use * (protected by @inner_lock) * @default_priority: default scheduler priority * (invariant after initialized) * @debugfs_entry: debugfs node * @alloc: binder allocator bookkeeping * @context: binder_context for this proc * (invariant after initialized) * @inner_lock: can nest under outer_lock and/or node lock * @outer_lock: no nesting under innor or node lock * Lock order: 1) outer, 2) node, 3) inner * * Bookkeeping structure for binder processes */ struct binder_proc { struct hlist_node proc_node; struct rb_root threads; struct rb_root nodes; struct rb_root refs_by_desc; struct rb_root refs_by_node; struct list_head waiting_threads; int pid; struct task_struct *tsk; struct files_struct *files; struct mutex files_lock; const struct cred *cred; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; struct list_head todo; struct binder_stats stats; struct list_head delivered_death; int max_threads; int requested_threads; int reques |