| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* V4L2 device support header. Copyright (C) 2008 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_DEVICE_H #define _V4L2_DEVICE_H #include <media/media-device.h> #include <media/v4l2-subdev.h> #include <media/v4l2-dev.h> struct v4l2_ctrl_handler; /** * struct v4l2_device - main struct to for V4L2 device drivers * * @dev: pointer to struct device. * @mdev: pointer to struct media_device, may be NULL. * @subdevs: used to keep track of the registered subdevs * @lock: lock this struct; can be used by the driver as well * if this struct is embedded into a larger struct. * @name: unique device name, by default the driver name + bus ID * @notify: notify operation called by some sub-devices. * @ctrl_handler: The control handler. May be %NULL. * @prio: Device's priority state * @ref: Keep track of the references to this struct. * @release: Release function that is called when the ref count * goes to 0. * * Each instance of a V4L2 device should create the v4l2_device struct, * either stand-alone or embedded in a larger struct. * * It allows easy access to sub-devices (see v4l2-subdev.h) and provides * basic V4L2 device-level support. * * .. note:: * * #) @dev->driver_data points to this struct. * #) @dev might be %NULL if there is no parent device */ struct v4l2_device { struct device *dev; struct media_device *mdev; struct list_head subdevs; spinlock_t lock; char name[36]; void (*notify)(struct v4l2_subdev *sd, unsigned int notification, void *arg); struct v4l2_ctrl_handler *ctrl_handler; struct v4l2_prio_state prio; struct kref ref; void (*release)(struct v4l2_device *v4l2_dev); }; /** * v4l2_device_get - gets a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to increment the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ static inline void v4l2_device_get(struct v4l2_device *v4l2_dev) { kref_get(&v4l2_dev->ref); } /** * v4l2_device_put - puts a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to decrement the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ int v4l2_device_put(struct v4l2_device *v4l2_dev); /** * v4l2_device_register - Initialize v4l2_dev and make @dev->driver_data * point to @v4l2_dev. * * @dev: pointer to struct &device * @v4l2_dev: pointer to struct &v4l2_device * * .. note:: * @dev may be %NULL in rare cases (ISA devices). * In such case the caller must fill in the @v4l2_dev->name field * before calling this function. */ int __must_check v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); /** * v4l2_device_set_name - Optional function to initialize the * name field of struct &v4l2_device * * @v4l2_dev: pointer to struct &v4l2_device * @basename: base name for the device name * @instance: pointer to a static atomic_t var with the instance usage for * the device driver. * * v4l2_device_set_name() initializes the name field of struct &v4l2_device * using the driver name and a driver-global atomic_t instance. * * This function will increment the instance counter and returns the * instance value used in the name. * * Example: * * static atomic_t drv_instance = ATOMIC_INIT(0); * * ... * * instance = v4l2_device_set_name(&\ v4l2_dev, "foo", &\ drv_instance); * * The first time this is called the name field will be set to foo0 and * this function returns 0. If the name ends with a digit (e.g. cx18), * then the name will be set to cx18-0 since cx180 would look really odd. */ int v4l2_device_set_name(struct v4l2_device *v4l2_dev, const char *basename, atomic_t *instance); /** * v4l2_device_disconnect - Change V4L2 device state to disconnected. * * @v4l2_dev: pointer to struct v4l2_device * * Should be called when the USB parent disconnects. * Since the parent disappears, this ensures that @v4l2_dev doesn't have * an invalid parent pointer. * * .. note:: This function sets @v4l2_dev->dev to NULL. */ void v4l2_device_disconnect(struct v4l2_device *v4l2_dev); /** * v4l2_device_unregister - Unregister all sub-devices and any other * resources related to @v4l2_dev. * * @v4l2_dev: pointer to struct v4l2_device */ void v4l2_device_unregister(struct v4l2_device *v4l2_dev); /** * v4l2_device_register_subdev - Registers a subdev with a v4l2 device. * * @v4l2_dev: pointer to struct &v4l2_device * @sd: pointer to &struct v4l2_subdev * * While registered, the subdev module is marked as in-use. * * An error is returned if the module is no longer loaded on any attempts * to register it. */ #define v4l2_device_register_subdev(v4l2_dev, sd) \ __v4l2_device_register_subdev(v4l2_dev, sd, THIS_MODULE) int __must_check __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, struct v4l2_subdev *sd, struct module *module); /** * v4l2_device_unregister_subdev - Unregisters a subdev with a v4l2 device. * * @sd: pointer to &struct v4l2_subdev * * .. note :: * * Can also be called if the subdev wasn't registered. In such * case, it will do nothing. */ void v4l2_device_unregister_subdev(struct v4l2_subdev *sd); /** * __v4l2_device_register_subdev_nodes - Registers device nodes for * all subdevs of the v4l2 device that are marked with the * %V4L2_SUBDEV_FL_HAS_DEVNODE flag. * * @v4l2_dev: pointer to struct v4l2_device * @read_only: subdevices read-only flag. True to register the subdevices * device nodes in read-only mode, false to allow full access to the * subdevice userspace API. */ int __must_check __v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev, bool read_only); /** * v4l2_device_register_subdev_nodes - Registers subdevices device nodes with * unrestricted access to the subdevice userspace operations * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, false); #else return 0; #endif } /** * v4l2_device_register_ro_subdev_nodes - Registers subdevices device nodes * in read-only mode * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_ro_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, true); #else return 0; #endif } /** * v4l2_subdev_notify - Sends a notification to v4l2_device. * * @sd: pointer to &struct v4l2_subdev * @notification: type of notification. Please notice that the notification * type is driver-specific. * @arg: arguments for the notification. Those are specific to each * notification type. */ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd, unsigned int notification, void *arg) { if (sd && sd->v4l2_dev && sd->v4l2_dev->notify) sd->v4l2_dev->notify(sd, notification, arg); } /** * v4l2_device_supports_requests - Test if requests are supported. * * @v4l2_dev: pointer to struct v4l2_device */ static inline bool v4l2_device_supports_requests(struct v4l2_device *v4l2_dev) { return v4l2_dev->mdev && v4l2_dev->mdev->ops && v4l2_dev->mdev->ops->req_queue; } /* Helper macros to iterate over all subdevs. */ /** * v4l2_device_for_each_subdev - Helper macro that interates over all * sub-devices of a given &v4l2_device. * * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * * This macro iterates over all sub-devices owned by the @v4l2_dev device. * It acts as a for loop iterator and executes the next statement with * the @sd variable pointing to each sub-device in turn. */ #define v4l2_device_for_each_subdev(sd, v4l2_dev) \ list_for_each_entry(sd, &(v4l2_dev)->subdevs, list) /** * __v4l2_device_call_subdevs_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_p(v4l2_dev, sd, cond, o, f, args...) \ do { \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ (sd)->ops->o->f((sd) , ##args); \ } while (0) /** * __v4l2_device_call_subdevs - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs(v4l2_dev, cond, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ } while (0) /** * __v4l2_device_call_subdevs_until_err_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev sub-devices associated with @v4l2_dev. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, zero * otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err_p(v4l2_dev, sd, cond, o, f, args...) \ ({ \ long __err = 0; \ \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) { \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ __err = (sd)->ops->o->f((sd) , ##args); \ if (__err && __err != -ENOIOCTLCMD) \ break; \ } \ (__err == -ENOIOCTLCMD) ? 0 : __err; \ }) /** * __v4l2_device_call_subdevs_until_err - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err(v4l2_dev, cond, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ }) /** * v4l2_device_call_all - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_all(v4l2_dev, grpid, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ } while (0) /** * v4l2_device_call_until_err - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver, until an error occurs. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_until_err(v4l2_dev, grpid, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ }) /** * v4l2_device_mask_call_all - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_all(v4l2_dev, grpmsk, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ } while (0) /** * v4l2_device_mask_call_until_err - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_until_err(v4l2_dev, grpmsk, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ }) /** * v4l2_device_has_op - checks if any subdev with matching grpid has a * given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_has_op(v4l2_dev, grpid, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpid) && __sd->grp_id != (grpid)) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) /** * v4l2_device_mask_has_op - checks if any subdev with matching group * mask has a given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_mask_has_op(v4l2_dev, grpmsk, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpmsk) && !(__sd->grp_id & (grpmsk))) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) #endif |
| 3928 5679 1 29 29 29 27 1 1 28 28 337 335 1 75 337 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/minmax.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/types.h> #include <linux/wait.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @word: word holding free bits */ unsigned long word; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap. */ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @map: Allocated bitmap. */ struct sbitmap_word *map; /* * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /* * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; /** * @completion_cnt: Number of bits cleared passed to the * wakeup function. */ atomic_t completion_cnt; /** * @wakeup_cnt: Number of thread wake ups issued. */ atomic_t wakeup_cnt; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * @alloc_hint: If true, apply percpu hint for where to start searching for * a free bit. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint); /* sbitmap internal helper */ static inline unsigned int __map_depth(const struct sbitmap *sb, int index) { if (index == sb->map_nr - 1) return sb->depth - (index << sb->shift); return 1U << sb->shift; } /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { free_percpu(sb->alloc_hint); kvfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. * * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb); /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @shallow_depth: The maximum number of bits to allocate from a single word. * * This rather specific operation allows for having multiple users with * different allocation limits. E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, __map_depth(sb, index) - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } /* * Pair of sbitmap_get, and this one applies both cleared bit and * allocation hint. */ static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr) { sbitmap_deferred_clear_bit(sb, bitnr); if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth)) *raw_cpu_ptr(sb->alloc_hint) = bitnr; } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_calculate_shift(unsigned int depth) { int shift = ilog2(BITS_PER_LONG); /* * If the bitmap is small, shrink the number of bits per word so * we spread over a few cachelines, at least. If less than 4 * bits, just forget about it, it's not going to work optimally * anyway. */ if (depth >= 4) { while ((4U << shift) > depth) shift--; } return shift; } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_weight() - Return how many set and not cleared bits in a &struct * sbitmap. * @sb: Bitmap to check. * * Return: How many set and not cleared bits set */ unsigned int sbitmap_weight(const struct sbitmap *sb); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch * @sbq: Bitmap queue to recalculate wake batch. * @users: Number of shares. * * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch * by depth. This interface is for HCTX shared tags or queue shared tags. */ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, unsigned int users); /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits * @sbq: Bitmap queue to allocate from. * @nr_tags: number of tags requested * @offset: offset to add to returned bits * * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is * a bit in the mask returned, and the caller must add @offset to the value to * get the absolute tag value. */ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset); /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); /** * sbitmap_queue_clear_batch() - Free a batch of allocated bits * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @offset: offset for each tag in array * @tags: array of tags * @nr_tags: number of tags in array */ void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset, int *tags, int nr_tags); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. * @nr: Number of bits cleared. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue() */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2001 Vojtech Pavlik * Copyright (c) 2006-2007 Jiri Kosina */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #ifndef __HID_H #define __HID_H #include <linux/bitops.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mod_devicetable.h> /* hid_device_id */ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/input.h> #include <linux/semaphore.h> #include <linux/mutex.h> #include <linux/power_supply.h> #include <uapi/linux/hid.h> #include <linux/hid_bpf.h> /* * We parse each description item into this structure. Short items data * values are expanded to 32-bit signed int, long items contain a pointer * into the data area. */ struct hid_item { unsigned format; __u8 size; __u8 type; __u8 tag; union { __u8 u8; __s8 s8; __u16 u16; __s16 s16; __u32 u32; __s32 s32; const __u8 *longdata; } data; }; /* * HID report item format */ #define HID_ITEM_FORMAT_SHORT 0 #define HID_ITEM_FORMAT_LONG 1 /* * Special tag indicating long items */ #define HID_ITEM_TAG_LONG 15 /* * HID report descriptor item type (prefix bit 2,3) */ #define HID_ITEM_TYPE_MAIN 0 #define HID_ITEM_TYPE_GLOBAL 1 #define HID_ITEM_TYPE_LOCAL 2 #define HID_ITEM_TYPE_RESERVED 3 /* * HID report descriptor main item tags */ #define HID_MAIN_ITEM_TAG_INPUT 8 #define HID_MAIN_ITEM_TAG_OUTPUT 9 #define HID_MAIN_ITEM_TAG_FEATURE 11 #define HID_MAIN_ITEM_TAG_BEGIN_COLLECTION 10 #define HID_MAIN_ITEM_TAG_END_COLLECTION 12 /* * HID report descriptor main item contents */ #define HID_MAIN_ITEM_CONSTANT 0x001 #define HID_MAIN_ITEM_VARIABLE 0x002 #define HID_MAIN_ITEM_RELATIVE 0x004 #define HID_MAIN_ITEM_WRAP 0x008 #define HID_MAIN_ITEM_NONLINEAR 0x010 #define HID_MAIN_ITEM_NO_PREFERRED 0x020 #define HID_MAIN_ITEM_NULL_STATE 0x040 #define HID_MAIN_ITEM_VOLATILE 0x080 #define HID_MAIN_ITEM_BUFFERED_BYTE 0x100 /* * HID report descriptor collection item types */ #define HID_COLLECTION_PHYSICAL 0 #define HID_COLLECTION_APPLICATION 1 #define HID_COLLECTION_LOGICAL 2 #define HID_COLLECTION_NAMED_ARRAY 4 /* * HID report descriptor global item tags */ #define HID_GLOBAL_ITEM_TAG_USAGE_PAGE 0 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM 1 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM 2 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM 3 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM 4 #define HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT 5 #define HID_GLOBAL_ITEM_TAG_UNIT 6 #define HID_GLOBAL_ITEM_TAG_REPORT_SIZE 7 #define HID_GLOBAL_ITEM_TAG_REPORT_ID 8 #define HID_GLOBAL_ITEM_TAG_REPORT_COUNT 9 #define HID_GLOBAL_ITEM_TAG_PUSH 10 #define HID_GLOBAL_ITEM_TAG_POP 11 /* * HID report descriptor local item tags */ #define HID_LOCAL_ITEM_TAG_USAGE 0 #define HID_LOCAL_ITEM_TAG_USAGE_MINIMUM 1 #define HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM 2 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_INDEX 3 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MINIMUM 4 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MAXIMUM 5 #define HID_LOCAL_ITEM_TAG_STRING_INDEX 7 #define HID_LOCAL_ITEM_TAG_STRING_MINIMUM 8 #define HID_LOCAL_ITEM_TAG_STRING_MAXIMUM 9 #define HID_LOCAL_ITEM_TAG_DELIMITER 10 /* * HID usage tables */ #define HID_USAGE_PAGE 0xffff0000 #define HID_UP_UNDEFINED 0x00000000 #define HID_UP_GENDESK 0x00010000 #define HID_UP_SIMULATION 0x00020000 #define HID_UP_GENDEVCTRLS 0x00060000 #define HID_UP_KEYBOARD 0x00070000 #define HID_UP_LED 0x00080000 #define HID_UP_BUTTON 0x00090000 #define HID_UP_ORDINAL 0x000a0000 #define HID_UP_TELEPHONY 0x000b0000 #define HID_UP_CONSUMER 0x000c0000 #define HID_UP_DIGITIZER 0x000d0000 #define HID_UP_PID 0x000f0000 #define HID_UP_BATTERY 0x00850000 #define HID_UP_CAMERA 0x00900000 #define HID_UP_HPVENDOR 0xff7f0000 #define HID_UP_HPVENDOR2 0xff010000 #define HID_UP_MSVENDOR 0xff000000 #define HID_UP_CUSTOM 0x00ff0000 #define HID_UP_LOGIVENDOR 0xffbc0000 #define HID_UP_LOGIVENDOR2 0xff090000 #define HID_UP_LOGIVENDOR3 0xff430000 #define HID_UP_LNVENDOR 0xffa00000 #define HID_UP_SENSOR 0x00200000 #define HID_UP_ASUSVENDOR 0xff310000 #define HID_UP_GOOGLEVENDOR 0xffd10000 #define HID_USAGE 0x0000ffff #define HID_GD_POINTER 0x00010001 #define HID_GD_MOUSE 0x00010002 #define HID_GD_JOYSTICK 0x00010004 #define HID_GD_GAMEPAD 0x00010005 #define HID_GD_KEYBOARD 0x00010006 #define HID_GD_KEYPAD 0x00010007 #define HID_GD_MULTIAXIS 0x00010008 /* * Microsoft Win8 Wireless Radio Controls extensions CA, see: * http://www.usb.org/developers/hidpage/HUTRR40RadioHIDUsagesFinal.pdf */ #define HID_GD_WIRELESS_RADIO_CTLS 0x0001000c /* * System Multi-Axis, see: * http://www.usb.org/developers/hidpage/HUTRR62_-_Generic_Desktop_CA_for_System_Multi-Axis_Controllers.txt */ #define HID_GD_SYSTEM_MULTIAXIS 0x0001000e #define HID_GD_X 0x00010030 #define HID_GD_Y 0x00010031 #define HID_GD_Z 0x00010032 #define HID_GD_RX 0x00010033 #define HID_GD_RY 0x00010034 #define HID_GD_RZ 0x00010035 #define HID_GD_SLIDER 0x00010036 #define HID_GD_DIAL 0x00010037 #define HID_GD_WHEEL 0x00010038 #define HID_GD_HATSWITCH 0x00010039 #define HID_GD_BUFFER 0x0001003a #define HID_GD_BYTECOUNT 0x0001003b #define HID_GD_MOTION 0x0001003c #define HID_GD_START 0x0001003d #define HID_GD_SELECT 0x0001003e #define HID_GD_VX 0x00010040 #define HID_GD_VY 0x00010041 #define HID_GD_VZ 0x00010042 #define HID_GD_VBRX 0x00010043 #define HID_GD_VBRY 0x00010044 #define HID_GD_VBRZ 0x00010045 #define HID_GD_VNO 0x00010046 #define HID_GD_FEATURE 0x00010047 #define HID_GD_RESOLUTION_MULTIPLIER 0x00010048 #define HID_GD_SYSTEM_CONTROL 0x00010080 #define HID_GD_UP 0x00010090 #define HID_GD_DOWN 0x00010091 #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 /* Microsoft Win8 Wireless Radio Controls CA usage codes */ #define HID_GD_RFKILL_BTN 0x000100c6 #define HID_GD_RFKILL_LED 0x000100c7 #define HID_GD_RFKILL_SWITCH 0x000100c8 #define HID_DC_BATTERYSTRENGTH 0x00060020 #define HID_CP_CONSUMER_CONTROL 0x000c0001 #define HID_CP_AC_PAN 0x000c0238 #define HID_DG_DIGITIZER 0x000d0001 #define HID_DG_PEN 0x000d0002 #define HID_DG_LIGHTPEN 0x000d0003 #define HID_DG_TOUCHSCREEN 0x000d0004 #define HID_DG_TOUCHPAD 0x000d0005 #define HID_DG_WHITEBOARD 0x000d0006 #define HID_DG_STYLUS 0x000d0020 #define HID_DG_PUCK 0x000d0021 #define HID_DG_FINGER 0x000d0022 #define HID_DG_TIPPRESSURE 0x000d0030 #define HID_DG_BARRELPRESSURE 0x000d0031 #define HID_DG_INRANGE 0x000d0032 #define HID_DG_TOUCH 0x000d0033 #define HID_DG_UNTOUCH 0x000d0034 #define HID_DG_TAP 0x000d0035 #define HID_DG_TRANSDUCER_INDEX 0x000d0038 #define HID_DG_TABLETFUNCTIONKEY 0x000d0039 #define HID_DG_PROGRAMCHANGEKEY 0x000d003a #define HID_DG_BATTERYSTRENGTH 0x000d003b #define HID_DG_INVERT 0x000d003c #define HID_DG_TILT_X 0x000d003d #define HID_DG_TILT_Y 0x000d003e #define HID_DG_TWIST 0x000d0041 #define HID_DG_TIPSWITCH 0x000d0042 #define HID_DG_TIPSWITCH2 0x000d0043 #define HID_DG_BARRELSWITCH 0x000d0044 #define HID_DG_ERASER 0x000d0045 #define HID_DG_TABLETPICK 0x000d0046 #define HID_DG_PEN_COLOR 0x000d005c #define HID_DG_PEN_LINE_WIDTH 0x000d005e #define HID_DG_PEN_LINE_STYLE 0x000d0070 #define HID_DG_PEN_LINE_STYLE_INK 0x000d0072 #define HID_DG_PEN_LINE_STYLE_PENCIL 0x000d0073 #define HID_DG_PEN_LINE_STYLE_HIGHLIGHTER 0x000d0074 #define HID_DG_PEN_LINE_STYLE_CHISEL_MARKER 0x000d0075 #define HID_DG_PEN_LINE_STYLE_BRUSH 0x000d0076 #define HID_DG_PEN_LINE_STYLE_NO_PREFERENCE 0x000d0077 #define HID_CP_CONSUMERCONTROL 0x000c0001 #define HID_CP_NUMERICKEYPAD 0x000c0002 #define HID_CP_PROGRAMMABLEBUTTONS 0x000c0003 #define HID_CP_MICROPHONE 0x000c0004 #define HID_CP_HEADPHONE 0x000c0005 #define HID_CP_GRAPHICEQUALIZER 0x000c0006 #define HID_CP_FUNCTIONBUTTONS 0x000c0036 #define HID_CP_SELECTION 0x000c0080 #define HID_CP_MEDIASELECTION 0x000c0087 #define HID_CP_SELECTDISC 0x000c00ba #define HID_CP_VOLUMEUP 0x000c00e9 #define HID_CP_VOLUMEDOWN 0x000c00ea #define HID_CP_PLAYBACKSPEED 0x000c00f1 #define HID_CP_PROXIMITY 0x000c0109 #define HID_CP_SPEAKERSYSTEM 0x000c0160 #define HID_CP_CHANNELLEFT 0x000c0161 #define HID_CP_CHANNELRIGHT 0x000c0162 #define HID_CP_CHANNELCENTER 0x000c0163 #define HID_CP_CHANNELFRONT 0x000c0164 #define HID_CP_CHANNELCENTERFRONT 0x000c0165 #define HID_CP_CHANNELSIDE 0x000c0166 #define HID_CP_CHANNELSURROUND 0x000c0167 #define HID_CP_CHANNELLOWFREQUENCYENHANCEMENT 0x000c0168 #define HID_CP_CHANNELTOP 0x000c0169 #define HID_CP_CHANNELUNKNOWN 0x000c016a #define HID_CP_APPLICATIONLAUNCHBUTTONS 0x000c0180 #define HID_CP_GENERICGUIAPPLICATIONCONTROLS 0x000c0200 #define HID_DG_DEVICECONFIG 0x000d000e #define HID_DG_DEVICESETTINGS 0x000d0023 #define HID_DG_AZIMUTH 0x000d003f #define HID_DG_CONFIDENCE 0x000d0047 #define HID_DG_WIDTH 0x000d0048 #define HID_DG_HEIGHT 0x000d0049 #define HID_DG_CONTACTID 0x000d0051 #define HID_DG_INPUTMODE 0x000d0052 #define HID_DG_DEVICEINDEX 0x000d0053 #define HID_DG_CONTACTCOUNT 0x000d0054 #define HID_DG_CONTACTMAX 0x000d0055 #define HID_DG_SCANTIME 0x000d0056 #define HID_DG_SURFACESWITCH 0x000d0057 #define HID_DG_BUTTONSWITCH 0x000d0058 #define HID_DG_BUTTONTYPE 0x000d0059 #define HID_DG_BARRELSWITCH2 0x000d005a #define HID_DG_TOOLSERIALNUMBER 0x000d005b #define HID_DG_LATENCYMODE 0x000d0060 #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_BAT_CHARGING 0x00850044 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 /* * HID connect requests */ #define HID_CONNECT_HIDINPUT BIT(0) #define HID_CONNECT_HIDINPUT_FORCE BIT(1) #define HID_CONNECT_HIDRAW BIT(2) #define HID_CONNECT_HIDDEV BIT(3) #define HID_CONNECT_HIDDEV_FORCE BIT(4) #define HID_CONNECT_FF BIT(5) #define HID_CONNECT_DRIVER BIT(6) #define HID_CONNECT_DEFAULT (HID_CONNECT_HIDINPUT|HID_CONNECT_HIDRAW| \ HID_CONNECT_HIDDEV|HID_CONNECT_FF) /* * HID device quirks. */ /* * Increase this if you need to configure more HID quirks at module load time */ #define MAX_USBHID_BOOT_QUIRKS 4 /** * DOC: HID quirks * | @HID_QUIRK_NOTOUCH: * | @HID_QUIRK_IGNORE: ignore this device * | @HID_QUIRK_NOGET: * | @HID_QUIRK_HIDDEV_FORCE: * | @HID_QUIRK_BADPAD: * | @HID_QUIRK_MULTI_INPUT: * | @HID_QUIRK_HIDINPUT_FORCE: * | @HID_QUIRK_ALWAYS_POLL: * | @HID_QUIRK_INPUT_PER_APP: * | @HID_QUIRK_X_INVERT: * | @HID_QUIRK_Y_INVERT: * | @HID_QUIRK_SKIP_OUTPUT_REPORTS: * | @HID_QUIRK_SKIP_OUTPUT_REPORT_ID: * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP: * | @HID_QUIRK_HAVE_SPECIAL_DRIVER: * | @HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE: * | @HID_QUIRK_IGNORE_SPECIAL_DRIVER * | @HID_QUIRK_FULLSPEED_INTERVAL: * | @HID_QUIRK_NO_INIT_REPORTS: * | @HID_QUIRK_NO_IGNORE: * | @HID_QUIRK_NO_INPUT_SYNC: */ /* BIT(0) reserved for backward compatibility, was HID_QUIRK_INVERT */ #define HID_QUIRK_NOTOUCH BIT(1) #define HID_QUIRK_IGNORE BIT(2) #define HID_QUIRK_NOGET BIT(3) #define HID_QUIRK_HIDDEV_FORCE BIT(4) #define HID_QUIRK_BADPAD BIT(5) #define HID_QUIRK_MULTI_INPUT BIT(6) #define HID_QUIRK_HIDINPUT_FORCE BIT(7) /* BIT(8) reserved for backward compatibility, was HID_QUIRK_NO_EMPTY_INPUT */ /* BIT(9) reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL BIT(10) #define HID_QUIRK_INPUT_PER_APP BIT(11) #define HID_QUIRK_X_INVERT BIT(12) #define HID_QUIRK_Y_INVERT BIT(13) #define HID_QUIRK_SKIP_OUTPUT_REPORTS BIT(16) #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) #define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_NOINVERT BIT(21) #define HID_QUIRK_IGNORE_SPECIAL_DRIVER BIT(22) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) #define HID_QUIRK_NO_INPUT_SYNC BIT(31) /* * HID device groups * * Note: HID_GROUP_ANY is declared in linux/mod_devicetable.h * and has a value of 0x0000 */ #define HID_GROUP_GENERIC 0x0001 #define HID_GROUP_MULTITOUCH 0x0002 #define HID_GROUP_SENSOR_HUB 0x0003 #define HID_GROUP_MULTITOUCH_WIN_8 0x0004 /* * Vendor specific HID device groups */ #define HID_GROUP_RMI 0x0100 #define HID_GROUP_WACOM 0x0101 #define HID_GROUP_LOGITECH_DJ_DEVICE 0x0102 #define HID_GROUP_STEAM 0x0103 #define HID_GROUP_LOGITECH_27MHZ_DEVICE 0x0104 #define HID_GROUP_VIVALDI 0x0105 /* * HID protocol status */ #define HID_REPORT_PROTOCOL 1 #define HID_BOOT_PROTOCOL 0 /* * This is the global environment of the parser. This information is * persistent for main-items. The global environment can be saved and * restored with PUSH/POP statements. */ struct hid_global { unsigned usage_page; __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; unsigned report_id; unsigned report_size; unsigned report_count; }; /* * This is the local environment. It is persistent up the next main-item. */ #define HID_MAX_USAGES 12288 #define HID_DEFAULT_NUM_COLLECTIONS 16 struct hid_local { unsigned usage[HID_MAX_USAGES]; /* usage array */ u8 usage_size[HID_MAX_USAGES]; /* usage size array */ unsigned collection_index[HID_MAX_USAGES]; /* collection index array */ unsigned usage_index; unsigned usage_minimum; unsigned delimiter_depth; unsigned delimiter_branch; }; /* * This is the collection stack. We climb up the stack to determine * application and function of each field. */ struct hid_collection { int parent_idx; /* device->collection */ unsigned type; unsigned usage; unsigned level; }; struct hid_usage { unsigned hid; /* hid usage code */ unsigned collection_index; /* index into collection array */ unsigned usage_index; /* index into usage array */ __s8 resolution_multiplier;/* Effective Resolution Multiplier (HUT v1.12, 4.3.1), default: 1 */ /* hidinput data */ __s8 wheel_factor; /* 120/resolution_multiplier */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ __s16 hat_min; /* hat switch fun */ __s16 hat_max; /* ditto */ __s16 hat_dir; /* ditto */ __s16 wheel_accumulated; /* hi-res wheel */ }; struct hid_input; struct hid_field { unsigned physical; /* physical usage for this field */ unsigned logical; /* logical usage for this field */ unsigned application; /* application usage for this field */ struct hid_usage *usage; /* usage table for this function */ unsigned maxusage; /* maximum usage index */ unsigned flags; /* main-item flags (i.e. volatile,array,constant) */ unsigned report_offset; /* bit offset in the report */ unsigned report_size; /* size of this field in the report */ unsigned report_count; /* number of this field in the report */ unsigned report_type; /* (input,output,feature) */ __s32 *value; /* last known value(s) */ __s32 *new_value; /* newly read value(s) */ __s32 *usages_priorities; /* priority of each usage when reading the report * bits 8-16 are reserved for hid-input usage */ __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; bool ignored; /* this field is ignored in this event */ struct hid_report *report; /* associated report */ unsigned index; /* index into report->field[] */ /* hidinput data */ struct hid_input *hidinput; /* associated input structure */ __u16 dpad; /* dpad input code */ unsigned int slot_idx; /* slot index in a report */ }; #define HID_MAX_FIELDS 256 struct hid_field_entry { struct list_head list; struct hid_field *field; unsigned int index; __s32 priority; }; struct hid_report { struct list_head list; struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ unsigned maxfield; /* maximum valid field index */ unsigned size; /* size of the report (bits) */ struct hid_device *device; /* associated device */ /* tool related state */ bool tool_active; /* whether the current tool is active */ unsigned int tool; /* BTN_TOOL_* */ }; #define HID_MAX_IDS 256 struct hid_report_enum { unsigned numbered; struct list_head report_list; struct hid_report *report_id_hash[HID_MAX_IDS]; }; #define HID_MIN_BUFFER_SIZE 64 /* make sure there is at least a packet size of space */ #define HID_MAX_BUFFER_SIZE 16384 /* 16kb */ #define HID_CONTROL_FIFO_SIZE 256 /* to init devices with >100 reports */ #define HID_OUTPUT_FIFO_SIZE 64 struct hid_control_fifo { unsigned char dir; struct hid_report *report; char *raw_report; }; struct hid_output_fifo { struct hid_report *report; char *raw_report; }; #define HID_CLAIMED_INPUT BIT(0) #define HID_CLAIMED_HIDDEV BIT(1) #define HID_CLAIMED_HIDRAW BIT(2) #define HID_CLAIMED_DRIVER BIT(3) #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) #define HID_STAT_DUP_DETECTED BIT(2) #define HID_STAT_REPROBED BIT(3) struct hid_input { struct list_head list; struct hid_report *report; struct input_dev *input; const char *name; struct list_head reports; /* the list of reports */ unsigned int application; /* application usage for this input */ bool registered; }; enum hid_type { HID_TYPE_OTHER = 0, HID_TYPE_USBMOUSE, HID_TYPE_USBNONE }; enum hid_battery_status { HID_BATTERY_UNKNOWN = 0, HID_BATTERY_QUERIED, /* Kernel explicitly queried battery strength */ HID_BATTERY_REPORTED, /* Device sent unsolicited battery strength report */ }; struct hid_driver; struct hid_ll_driver; struct hid_device { const __u8 *dev_rdesc; /* device report descriptor */ const __u8 *bpf_rdesc; /* bpf modified report descriptor, if any */ const __u8 *rdesc; /* currently used report descriptor */ unsigned int dev_rsize; unsigned int bpf_rsize; unsigned int rsize; unsigned int collection_size; /* Number of allocated hid_collections */ struct hid_collection *collection; /* List of HID collections */ unsigned int maxcollection; /* Number of parsed collections */ unsigned int maxapplication; /* Number of applications */ __u16 bus; /* BUS ID */ __u16 group; /* Report group */ __u32 vendor; /* Vendor ID */ __u32 product; /* Product ID */ __u32 version; /* HID version */ enum hid_type type; /* device type (mouse, kbd, ...) */ unsigned country; /* HID country */ struct hid_report_enum report_enum[HID_REPORT_TYPES]; struct work_struct led_work; /* delayed LED worker */ struct semaphore driver_input_lock; /* protects the current driver */ struct device dev; /* device */ struct hid_driver *driver; void *devres_group_id; /* ID of probe devres group */ const struct hid_ll_driver *ll_driver; struct mutex ll_open_lock; unsigned int ll_open_count; #ifdef CONFIG_HID_BATTERY_STRENGTH /* * Power supply information for HID devices which report * battery strength. power_supply was successfully registered if * battery is non-NULL. */ struct power_supply *battery; __s32 battery_capacity; __s32 battery_min; __s32 battery_max; __s32 battery_report_type; __s32 battery_report_id; __s32 battery_charge_status; enum hid_battery_status battery_status; bool battery_avoid_query; ktime_t battery_ratelimit_time; #endif unsigned long status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? */ unsigned quirks; /* Various quirks the device can pull on us */ unsigned initial_quirks; /* Initial set of quirks supplied when creating device */ bool io_started; /* If IO has started */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ void *hidraw; char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ int (*ff_init)(struct hid_device *); /* hiddev event handler */ int (*hiddev_connect)(struct hid_device *, unsigned int); void (*hiddev_disconnect)(struct hid_device *); void (*hiddev_hid_event) (struct hid_device *, struct hid_field *field, struct hid_usage *, __s32); void (*hiddev_report_event) (struct hid_device *, struct hid_report *); /* debugging support via debugfs */ unsigned short debug; struct dentry *debug_dir; struct dentry *debug_rdesc; struct dentry *debug_events; struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; struct kref ref; unsigned int id; /* system unique id */ #ifdef CONFIG_HID_BPF struct hid_bpf bpf; /* hid-bpf data */ #endif /* CONFIG_HID_BPF */ }; void hiddev_free(struct kref *ref); #define to_hid_device(pdev) \ container_of(pdev, struct hid_device, dev) static inline void *hid_get_drvdata(struct hid_device *hdev) { return dev_get_drvdata(&hdev->dev); } static inline void hid_set_drvdata(struct hid_device *hdev, void *data) { dev_set_drvdata(&hdev->dev, data); } #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 #define HID_SCAN_FLAG_MT_WIN_8 BIT(0) #define HID_SCAN_FLAG_VENDOR_SPECIFIC BIT(1) #define HID_SCAN_FLAG_GD_POINTER BIT(2) struct hid_parser { struct hid_global global; struct hid_global global_stack[HID_GLOBAL_STACK_SIZE]; unsigned int global_stack_ptr; struct hid_local local; unsigned int *collection_stack; unsigned int collection_stack_ptr; unsigned int collection_stack_size; struct hid_device *device; unsigned int scan_flags; }; struct hid_class_descriptor { __u8 bDescriptorType; __le16 wDescriptorLength; } __attribute__ ((packed)); struct hid_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdHID; __u8 bCountryCode; __u8 bNumDescriptors; struct hid_class_descriptor desc[1]; } __attribute__ ((packed)); #define HID_DEVICE(b, g, ven, prod) \ .bus = (b), .group = (g), .vendor = (ven), .product = (prod) #define HID_USB_DEVICE(ven, prod) \ .bus = BUS_USB, .vendor = (ven), .product = (prod) #define HID_BLUETOOTH_DEVICE(ven, prod) \ .bus = BUS_BLUETOOTH, .vendor = (ven), .product = (prod) #define HID_I2C_DEVICE(ven, prod) \ .bus = BUS_I2C, .vendor = (ven), .product = (prod) #define HID_REPORT_ID(rep) \ .report_type = (rep) #define HID_USAGE_ID(uhid, utype, ucode) \ .usage_hid = (uhid), .usage_type = (utype), .usage_code = (ucode) /* we don't want to catch types and codes equal to 0 */ #define HID_TERMINATOR (HID_ANY_ID - 1) struct hid_report_id { __u32 report_type; }; struct hid_usage_id { __u32 usage_hid; __u32 usage_type; __u32 usage_code; }; /** * struct hid_driver * @name: driver name (e.g. "Footech_bar-wheel") * @id_table: which devices is this driver for (must be non-NULL for probe * to be called) * @dyn_list: list of dynamically added device ids * @dyn_lock: lock protecting @dyn_list * @match: check if the given device is handled by this driver * @probe: new device inserted * @remove: device removed (NULL if not a hot-plug capable driver) * @report_table: on which reports to call raw_event (NULL means all) * @raw_event: if report in report_table, this hook is called (NULL means nop) * @usage_table: on which events to call event (NULL means all) * @event: if usage in usage_table, this hook is called (NULL means nop) * @report: this hook is called after parsing a report (NULL means nop) * @report_fixup: called before report descriptor parsing (NULL means nop) * @input_mapping: invoked on input registering before mapping an usage * @input_mapped: invoked on input registering after mapping an usage * @input_configured: invoked just before the device is registered * @feature_mapping: invoked on feature registering * @suspend: invoked on suspend (NULL means nop) * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) * * probe should return -errno on error, or 0 on success. During probe, * input will not be passed to raw_event unless hid_device_io_start is * called. * * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip * generic parsing (needed special handling which was done in the hook already) * input_mapped shall return negative to inform the layer that this usage * should not be considered for further processing or zero to notify that * no processing was performed and should be done in a generic manner * Both these functions may be NULL which means the same behavior as returning * zero from them. */ struct hid_driver { char *name; const struct hid_device_id *id_table; struct list_head dyn_list; spinlock_t dyn_lock; bool (*match)(struct hid_device *dev, bool ignore_special_driver); int (*probe)(struct hid_device *dev, const struct hid_device_id *id); void (*remove)(struct hid_device *dev); const struct hid_report_id *report_table; int (*raw_event)(struct hid_device *hdev, struct hid_report *report, u8 *data, int size); const struct hid_usage_id *usage_table; int (*event)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value); void (*report)(struct hid_device *hdev, struct hid_report *report); const __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf, unsigned int *size); int (*input_mapping)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_mapped)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_configured)(struct hid_device *hdev, struct hid_input *hidinput); void (*feature_mapping)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage); int (*suspend)(struct hid_device *hdev, pm_message_t message); int (*resume)(struct hid_device *hdev); int (*reset_resume)(struct hid_device *hdev); /* private: */ struct device_driver driver; }; #define to_hid_driver(pdrv) \ container_of(pdrv, struct hid_driver, driver) /** * struct hid_ll_driver - low level driver callbacks * @start: called on probe to start the device * @stop: called on remove * @open: called by input layer on open * @close: called by input layer on close * @power: request underlying hardware to enter requested power mode * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) * @wait: wait for buffered io to complete (send/recv reports) * @raw_request: send raw report request to device (e.g. feature report) * @output_report: send output report to device * @idle: send idle request to device * @may_wakeup: return if device may act as a wakeup source during system-suspend * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); void (*stop)(struct hid_device *hdev); int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); int (*power)(struct hid_device *hdev, int level); int (*parse)(struct hid_device *hdev); void (*request)(struct hid_device *hdev, struct hid_report *report, int reqtype); int (*wait)(struct hid_device *hdev); int (*raw_request) (struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype); int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len); int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); bool (*may_wakeup)(struct hid_device *hdev); unsigned int max_buffer_size; }; extern bool hid_is_usb(const struct hid_device *hdev); #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) \ (((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \ || ((a >= HID_DG_DIGITIZER) && (a <= HID_DG_WHITEBOARD)) \ || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL) \ || (a == HID_GD_WIRELESS_RADIO_CTLS)) /* HID core API */ extern bool hid_ignore(struct hid_device *); extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); extern const struct bus_type hid_bus_type; extern int __must_check __hid_register_driver(struct hid_driver *, struct module *, const char *mod_name); /* use a define to avoid include chaining to get THIS_MODULE & friends */ #define hid_register_driver(driver) \ __hid_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) extern void hid_unregister_driver(struct hid_driver *); /** * module_hid_driver() - Helper macro for registering a HID driver * @__hid_driver: hid_driver struct * * Helper macro for HID drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_hid_driver(__hid_driver) \ module_driver(__hid_driver, hid_register_driver, \ hid_unregister_driver) extern void hidinput_hid_event(struct hid_device *, struct hid_field *, struct hid_usage *, __s32); extern void hidinput_report_event(struct hid_device *hid, struct hid_report *report); extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, unsigned int application, unsigned int usage); int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); void hid_setup_resolution_multiplier(struct hid_device *hid); int hid_open_report(struct hid_device *device); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); void hid_disconnect(struct hid_device *hid); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv); bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator); __u32 hid_field_extract(const struct hid_device *hid, __u8 *report, unsigned offset, unsigned n); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state); int hid_driver_reset_resume(struct hid_device *hdev); int hid_driver_resume(struct hid_device *hdev); #else static inline int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { return 0; } static inline int hid_driver_reset_resume(struct hid_device *hdev) { return 0; } static inline int hid_driver_resume(struct hid_device *hdev) { return 0; } #endif /** * hid_device_io_start - enable HID input during probe, remove * * @hid: the device * * This should only be called during probe or remove and only be * called by the thread calling probe or remove. It will allow * incoming packets to be delivered to the driver. */ static inline void hid_device_io_start(struct hid_device *hid) { if (hid->io_started) { dev_warn(&hid->dev, "io already started\n"); return; } hid->io_started = true; up(&hid->driver_input_lock); } /** * hid_device_io_stop - disable HID input during probe, remove * * @hid: the device * * Should only be called after hid_device_io_start. It will prevent * incoming packets from going to the driver for the duration of * probe, remove. If called during probe, packets will still go to the * driver after probe is complete. This function should only be called * by the thread calling probe or remove. */ static inline void hid_device_io_stop(struct hid_device *hid) { if (!hid->io_started) { dev_warn(&hid->dev, "io already stopped\n"); return; } hid->io_started = false; down(&hid->driver_input_lock); } /** * hid_map_usage - map usage input bits * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The value pointed to by @bit will be set to NULL if either @type is * an unhandled event type, or if @c is out of range for @type. This * can be used as an error condition. */ static inline void hid_map_usage(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, unsigned int c) { struct input_dev *input = hidinput->input; unsigned long *bmap = NULL; unsigned int limit = 0; switch (type) { case EV_ABS: bmap = input->absbit; limit = ABS_MAX; break; case EV_REL: bmap = input->relbit; limit = REL_MAX; break; case EV_KEY: bmap = input->keybit; limit = KEY_MAX; break; case EV_LED: bmap = input->ledbit; limit = LED_MAX; break; case EV_MSC: bmap = input->mscbit; limit = MSC_MAX; break; } if (unlikely(c > limit || !bmap)) { pr_warn_ratelimited("%s: Invalid code %d type %d\n", input->name, c, type); *bit = NULL; return; } usage->type = type; usage->code = c; *max = limit; *bit = bmap; } /** * hid_map_usage_clear - map usage input bits and clear the input bit * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The same as hid_map_usage, except the @c bit is also cleared in supported * bits (@bit). */ static inline void hid_map_usage_clear(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, __u16 c) { hid_map_usage(hidinput, usage, bit, max, type, c); if (*bit) clear_bit(usage->code, *bit); } /** * hid_parse - parse HW reports * * @hdev: hid device * * Call this from probe after you set up the device (if needed). Your * report_fixup will be called (if non-NULL) after reading raw report from * device before passing it to hid layer for real parsing. */ static inline int __must_check hid_parse(struct hid_device *hdev) { return hid_open_report(hdev); } int __must_check hid_hw_start(struct hid_device *hdev, unsigned int connect_mask); void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype); int __hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, __u64 source, bool from_bpf); int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, __u64 source, bool from_bpf); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** * hid_hw_power - requests underlying HW to go into given power mode * * @hdev: hid device * @level: requested power level (one of %PM_HINT_* defines) * * This function requests underlying hardware to enter requested power * mode. */ static inline int hid_hw_power(struct hid_device *hdev, int level) { return hdev->ll_driver->power ? hdev->ll_driver->power(hdev, level) : 0; } /** * hid_hw_idle - send idle request to device * * @hdev: hid device * @report: report to control * @idle: idle state * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); return 0; } /** * hid_hw_may_wakeup - return if the hid device may act as a wakeup source during system-suspend * * @hdev: hid device */ static inline bool hid_hw_may_wakeup(struct hid_device *hdev) { if (hdev->ll_driver->may_wakeup) return hdev->ll_driver->may_wakeup(hdev); if (hdev->dev.parent) return device_may_wakeup(hdev->dev.parent); return false; } /** * hid_hw_wait - wait for buffered io to complete * * @hdev: hid device */ static inline void hid_hw_wait(struct hid_device *hdev) { if (hdev->ll_driver->wait) hdev->ll_driver->wait(hdev); } /** * hid_report_len - calculate the report length * * @report: the report we want to know the length */ static inline u32 hid_report_len(struct hid_report *report) { return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); int hid_quirks_init(char **quirks_param, __u16 bus, int count); void hid_quirks_exit(__u16 bus); #ifdef CONFIG_HID_PID int hid_pidff_init(struct hid_device *hid); #else #define hid_pidff_init NULL #endif #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) #define hid_err(hid, fmt, ...) \ dev_err(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice(hid, fmt, ...) \ dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn(hid, fmt, ...) \ dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info(hid, fmt, ...) \ dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg(hid, fmt, ...) \ dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_err_once(hid, fmt, ...) \ dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice_once(hid, fmt, ...) \ dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn_once(hid, fmt, ...) \ dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info_once(hid, fmt, ...) \ dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg_once(hid, fmt, ...) \ dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) #endif |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ for_each_online_node(node) \ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) #endif /* _LINUX_TOPOLOGY_H */ |
| 9603 9614 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 | // SPDX-License-Identifier: GPL-2.0-only /* * x86 APERF/MPERF KHz calculation for * /sys/.../cpufreq/scaling_cur_freq * * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/sched/isolation.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/syscore_ops.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "cpu.h" struct aperfmperf { seqcount_t seq; unsigned long last_update; u64 acnt; u64 mcnt; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; static void init_counter_refs(void) { u64 aperf, mperf; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); } #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. * * The scheduler wants to do frequency invariant accounting and needs a <1 * ratio to account for the 'current' frequency, corresponding to * freq_curr / freq_max. * * Since the frequency freq_curr on x86 is controlled by micro-controller and * our P-state setting is little more than a request/hint, we need to observe * the effective frequency 'BusyMHz', i.e. the average frequency over a time * interval after discarding idle time. This is given by: * * BusyMHz = delta_APERF / delta_MPERF * freq_base * * where freq_base is the max non-turbo P-state. * * The freq_max term has to be set to a somewhat arbitrary value, because we * can't know which turbo states will be available at a given point in time: * it all depends on the thermal headroom of the entire package. We set it to * the turbo level with 4 cores active. * * Benchmarks show that's a good compromise between the 1C turbo ratio * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, * which would ignore the entire turbo range (a conspicuous part, making * freq_curr/freq_max always maxed out). * * An exception to the heuristic above is the Atom uarch, where we choose the * highest turbo level for freq_max since Atom's are generally oriented towards * power efficiency. * * Setting freq_max to anything less than the 1C turbo ratio makes the ratio * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; void arch_set_max_freq_ratio(bool turbo_disabled) { arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool __init turbo_disabled(void) { u64 misc_en; int err; err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ return true; } #define X86_MATCH(vfm) \ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_XEON_PHI_KNL), X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_ATOM_GOLDMONT), X86_MATCH(INTEL_ATOM_GOLDMONT_D), X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; u64 msr; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; fratio = (msr >> 8) & 0xFF; i = 16; found = 0; do { if (found >= num_delta_fratio) { *turbo_freq = fratio; return true; } delta_fratio = (msr >> (i + 5)) & 0x7; if (delta_fratio) { found += 1; fratio -= delta_fratio; } i += 8; } while (i < 64); return true; } static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; int err, i; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; for (i = 0; i < 64; i += 8) { group_size = (counts >> i) & 0xFF; if (group_size >= size) { *turbo_freq = (ratios >> i) & 0xFF; return true; } } return false; } static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ /* The CPU may have less than 4 cores */ if (!*turbo_freq) *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; if (x86_match_cpu(has_glm_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_knl_turbo_ratio_limits) && knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) goto out; if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; return false; out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. * Some CPUs have turbo boost but don't declare any turbo ratio * in MSR_TURBO_RATIO_LIMIT. */ if (!base_freq || !turbo_freq) { pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); if (!turbo_ratio) { pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); return false; } arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); return true; } #ifdef CONFIG_PM_SLEEP static struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; static void register_freq_invariance_syscore_ops(void) { register_syscore_ops(&freq_invariance_syscore_ops); } #else static inline void register_freq_invariance_syscore_ops(void) {} #endif static void freq_invariance_enable(void) { if (static_branch_unlikely(&arch_scale_freq_key)) { WARN_ON_ONCE(1); return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { arch_turbo_freq_ratio = ratio; arch_set_max_freq_ratio(turbo_disabled); freq_invariance_enable(); } static void __init bp_init_freq_invariance(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; if (intel_set_max_freq_ratio()) { guard(cpus_read_lock)(); freq_invariance_enable(); } } static void disable_freq_invariance_workfn(struct work_struct *work) { int cpu; static_branch_disable(&arch_scale_freq_key); /* * Set arch_freq_scale to a default value on all cpus * This negates the effect of scaling */ for_each_possible_cpu(cpu) per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); struct arch_hybrid_cpu_scale { unsigned long capacity; unsigned long freq_ratio; }; static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; /** * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling * * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, * initialize it and set the static key controlling its code paths. * * Must be called before arch_set_cpu_capacity(). */ bool arch_enable_hybrid_capacity_scale(void) { int cpu; if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); return true; } arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); if (!arch_cpu_scale) return false; for_each_possible_cpu(cpu) { per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; } static_branch_enable(&arch_hybrid_cap_scale_key); pr_info("Hybrid CPU capacity scaling enabled\n"); return true; } /** * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU * @cpu: Target CPU. * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. * @max_cap: System-wide maximum CPU capacity. * @cap_freq: Frequency of @cpu corresponding to @cap. * @base_freq: Frequency of @cpu at which MPERF counts. * * The units in which @cap and @max_cap are expressed do not matter, so long * as they are consistent, because the former is effectively divided by the * latter. Analogously for @cap_freq and @base_freq. * * After calling this function for all CPUs, call arch_rebuild_sched_domains() * to let the scheduler know that capacity-aware scheduling can be used going * forward. */ void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, unsigned long cap_freq, unsigned long base_freq) { if (static_branch_likely(&arch_hybrid_cap_scale_key)) { WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); } else { WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); } } unsigned long arch_scale_cpu_capacity(int cpu) { if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); return SCHED_CAPACITY_SCALE; } EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); else freq_ratio = arch_max_freq_ratio; if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); if (!freq_scale) goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); return; error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } #else static inline void bp_init_freq_invariance(void) { } static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } #endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; s->aperf = aperf; s->mperf = mperf; raw_write_seqcount_begin(&s->seq); s->last_update = jiffies; s->acnt = acnt; s->mcnt = mcnt; raw_write_seqcount_end(&s->seq); scale_freq_tick(acnt, mcnt); } /* * Discard samples older than the define maximum sample age of 20ms. There * is no point in sending IPIs in such a case. If the scheduler tick was * not running then the CPU is either idle or isolated. */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; unsigned long last; u64 acnt, mcnt; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) goto fallback; do { seq = raw_read_seqcount_begin(&s->seq); last = s->last_update; acnt = s->acnt; mcnt = s->mcnt; } while (read_seqcount_retry(&s->seq, seq)); /* * Bail on invalid count and when the last update was too long ago, * which covers idle and NOHZ full CPUs. */ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) goto fallback; return div64_u64((cpu_khz * acnt), mcnt); fallback: freq = cpufreq_quick_get(cpu); return freq ? freq : cpu_khz; } static int __init bp_init_aperfmperf(void) { if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; init_counter_refs(); bp_init_freq_invariance(); return 0; } early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) init_counter_refs(); } |
| 8 82 82 82 82 8 5 5 3 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/amiga.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #define pr_fmt(fmt) fmt #include <linux/types.h> #include <linux/mm_types.h> #include <linux/overflow.h> #include <linux/affs_hardblocks.h> #include "check.h" /* magic offsets in partition DosEnvVec */ #define NR_HD 3 #define NR_SECT 5 #define LO_CYL 9 #define HI_CYL 10 static __inline__ u32 checksum_block(__be32 *m, int size) { u32 sum = 0; while (size--) sum += be32_to_cpu(*m++); return sum; } int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; u64 start_sect, nr_sects; sector_t blk, end_sect; u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ u32 nr_hd, nr_sect, lo_cyl, hi_cyl; int part, res = 0; unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read RDB block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) continue; rdb = (struct RigidDiskBlock *)data; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) break; /* Try again with 0xdc..0xdf zeroed, Windows might have * trashed it. */ *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } pr_err("Dev %s: RDB in block %llu has bad checksum\n", state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; { char tmp[7 + 10 + 1 + 1]; /* Be more informative */ snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); strlcat(state->pp_buf, tmp, PAGE_SIZE); } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { /* Read in terms partition table understands */ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", state->disk->disk_name, blk, part); break; } data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read partition block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } pb = (struct PartitionBlock *)data; blk = be32_to_cpu(pb->pb_Next); if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) continue; if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; /* RDB gives us more than enough rope to hang ourselves with, * many times over (2^128 bytes if all fields max out). * Some careful checks are in order, so check for potential * overflows. * We are multiplying four 32 bit numbers to one sector_t! */ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); /* CylBlocks is total number of blocks per cylinder */ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", state->disk->disk_name, cylblk); continue; } /* check for consistency with RDB defined CylBlocks */ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", state->disk->disk_name, cylblk, be32_to_cpu(rdb->rdb_CylBlocks)); } /* RDB allows for variable logical block size - * normalize to 512 byte blocks and check result. */ if (check_mul_overflow(cylblk, blksize, &cylblk)) { pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", state->disk->disk_name, part); continue; } /* Calculate partition start and end. Limit of 32 bit on cylblk * guarantees no overflow occurs if LBD support is enabled. */ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); start_sect = ((u64) lo_cyl * cylblk); hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); if (!nr_sects) continue; /* Warn user if partition end overflows u32 (AmigaDOS limit) */ if ((start_sect + nr_sects) > UINT_MAX) { pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", state->disk->disk_name, part, start_sect, start_sect + nr_sects); } if (check_add_overflow(start_sect, nr_sects, &end_sect)) { pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", state->disk->disk_name, part, start_sect, end_sect); continue; } /* Tell Kernel about it */ put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ char dostype[4]; char tmp[42]; __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", dostype[0], dostype[1], dostype[2], dostype[3] + '@' ); else snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", dostype[0], dostype[1], dostype[2], dostype[3]); strlcat(state->pp_buf, tmp, PAGE_SIZE); snprintf(tmp, sizeof(tmp), "(res %d spb %d)", be32_to_cpu(pb->pb_Environment[6]), be32_to_cpu(pb->pb_Environment[4])); strlcat(state->pp_buf, tmp, PAGE_SIZE); } res = 1; } strlcat(state->pp_buf, "\n", PAGE_SIZE); rdb_done: return res; } |
| 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 | // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array(esize, size, gfp_mask); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_NOSPEC_BRANCH_H_ #define _ASM_X86_NOSPEC_BRANCH_H_ #include <linux/static_key.h> #include <linux/objtool.h> #include <linux/linkage.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> #include <asm/percpu.h> #include <asm/current.h> /* * Call depth tracking for Intel SKL CPUs to address the RSB underflow * issue in software. * * The tracking does not use a counter. It uses uses arithmetic shift * right on call entry and logical shift left on return. * * The depth tracking variable is initialized to 0x8000.... when the call * depth is zero. The arithmetic shift right sign extends the MSB and * saturates after the 12th call. The shift count is 5 for both directions * so the tracking covers 12 nested calls. * * Call * 0: 0x8000000000000000 0x0000000000000000 * 1: 0xfc00000000000000 0xf000000000000000 * ... * 11: 0xfffffffffffffff8 0xfffffffffffffc00 * 12: 0xffffffffffffffff 0xffffffffffffffe0 * * After a return buffer fill the depth is credited 12 calls before the * next stuffing has to take place. * * There is a inaccuracy for situations like this: * * 10 calls * 5 returns * 3 calls * 4 returns * 3 calls * .... * * The shift count might cause this to be off by one in either direction, * but there is still a cushion vs. the RSB depth. The algorithm does not * claim to be perfect and it can be speculated around by the CPU, but it * is considered that it obfuscates the problem enough to make exploitation * extremely difficult. */ #define RET_DEPTH_SHIFT 5 #define RSB_RET_STUFF_LOOPS 16 #define RET_DEPTH_INIT 0x8000000000000000ULL #define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL #define RET_DEPTH_CREDIT 0xffffffffffffffffULL #ifdef CONFIG_CALL_THUNKS_DEBUG # define CALL_THUNKS_DEBUG_INC_CALLS \ incq PER_CPU_VAR(__x86_call_count); # define CALL_THUNKS_DEBUG_INC_RETS \ incq PER_CPU_VAR(__x86_ret_count); # define CALL_THUNKS_DEBUG_INC_STUFFS \ incq PER_CPU_VAR(__x86_stuffs_count); # define CALL_THUNKS_DEBUG_INC_CTXSW \ incq PER_CPU_VAR(__x86_ctxsw_count); #else # define CALL_THUNKS_DEBUG_INC_CALLS # define CALL_THUNKS_DEBUG_INC_RETS # define CALL_THUNKS_DEBUG_INC_STUFFS # define CALL_THUNKS_DEBUG_INC_CTXSW #endif #if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) #include <asm/asm-offsets.h> #define CREDIT_CALL_DEPTH \ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); #define RESET_CALL_DEPTH_FROM_CALL \ movb $0xfc, %al; \ shl $56, %rax; \ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else #define CREDIT_CALL_DEPTH #define RESET_CALL_DEPTH #define RESET_CALL_DEPTH_FROM_CALL #define INCREMENT_CALL_DEPTH #endif /* * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to * eliminate potentially bogus entries from the RSB, and sometimes * purely to ensure that it doesn't get empty, which on some CPUs would * allow predictions from other (unwanted!) sources to be used. * * We define a CPP macro such that it can be used from both .S files and * inline assembly. It's possible to do a .macro and then include that * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. */ #define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN. */ #define __FILL_RETURN_SLOT \ ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ int3; \ 772: /* * Stuff the entire RSB. * * Google experimented with loop-unrolling and this turned out to be * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ #ifdef CONFIG_X86_64 #define __FILL_RETURN_BUFFER(reg, nr) \ mov $(nr/2), reg; \ 771: \ __FILL_RETURN_SLOT \ __FILL_RETURN_SLOT \ add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \ dec reg; \ jnz 771b; \ /* barrier for jnz misprediction */ \ lfence; \ CREDIT_CALL_DEPTH \ CALL_THUNKS_DEBUG_INC_CTXSW #else /* * i386 doesn't unconditionally have LFENCE, as such it can't * do a loop. */ #define __FILL_RETURN_BUFFER(reg, nr) \ .rept nr; \ __FILL_RETURN_SLOT; \ .endr; \ add $(BITS_PER_LONG/8) * nr, %_ASM_SP; #endif /* * Stuff a single RSB slot. * * To mitigate Post-Barrier RSB speculation, one CALL instruction must be * forced to retire before letting a RET instruction execute. * * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed * before this point. */ #define __FILL_ONE_RETURN \ __FILL_RETURN_SLOT \ add $(BITS_PER_LONG/8), %_ASM_SP; \ lfence; #ifdef __ASSEMBLY__ /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline * builds. */ .macro ANNOTATE_RETPOLINE_SAFE .Lhere_\@: .pushsection .discard.retpoline_safe .long .Lhere_\@ .popsection .endm /* * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions * vs RETBleed validation. */ #define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE /* * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should * eventually turn into its own annotation. */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif .endm /* * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call * to the retpoline thunk with a CS prefix when the register requires * a RAX prefix byte to encode. Also see apply_retpolines(). */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 .ifc \reg,\rs .byte 0x2e .endif .endr .endm /* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. * * NOTE: these do not take kCFI into account and are thus not comparable to C * indirect calls, take care when using. The target of these should be an ENDBR * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else jmp *%\reg int3 #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else call *%\reg #endif .endm /* * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2 .Lskip_rsb_\@: .endm /* * The CALL to srso_alias_untrain_ret() must be patched in directly at * the spot where untraining must be done, ie., srso_alias_untrain_ret() * must be the target of a CALL instruction instead of indirectly * jumping to a wrapper which then calls it. Therefore, this macro is * called outside of __UNTRAIN_RET below, for the time being, before the * kernel can support nested alternatives with arbitrary nesting. */ .macro CALL_UNTRAIN_RET #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \ "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS #endif .endm /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the * return thunk isn't mapped into the userspace tables (then again, AMD * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, * entry_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. */ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns #if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ "call entry_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm #define UNTRAIN_RET \ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) #define UNTRAIN_RET_VM \ __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) #define UNTRAIN_RET_FROM_CALL \ __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) .macro CALL_DEPTH_ACCOUNT #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING ALTERNATIVE "", \ __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif .endm /* * Macro to execute VERW instruction that mitigate transient data sampling * attacks such as MDS. On affected systems a microcode update overloaded VERW * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. * * Note: Only the memory operand variant of VERW clears the CPU buffers. */ .macro CLEAR_CPU_BUFFERS #ifdef CONFIG_X86_64 ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF #else /* * In 32bit mode, the memory operand must be a %cs reference. The data * segments may not be usable (vm86 mode), and the stack segment may not * be flat (ESPFIX32). */ ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF #endif .endm #ifdef CONFIG_X86_64 .macro CLEAR_BRANCH_HISTORY ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP .endm .macro CLEAR_BRANCH_HISTORY_VMEXIT ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT .endm #else #define CLEAR_BRANCH_HISTORY #define CLEAR_BRANCH_HISTORY_VMEXIT #endif #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ ".long 999b\n\t" \ ".popsection\n\t" typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; #ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); #else static inline void __x86_return_thunk(void) {} #endif #ifdef CONFIG_MITIGATION_UNRET_ENTRY extern void retbleed_return_thunk(void); #else static inline void retbleed_return_thunk(void) {} #endif extern void srso_alias_untrain_ret(void); #ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); #else static inline void srso_return_thunk(void) {} static inline void srso_alias_return_thunk(void) {} #endif extern void retbleed_return_thunk(void); extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); extern void entry_untrain_ret(void); extern void entry_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); #endif extern void (*x86_return_thunk)(void); extern void __warn_thunk(void); #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING extern void call_depth_return_thunk(void); #define CALL_DEPTH_ACCOUNT \ ALTERNATIVE("", \ __stringify(INCREMENT_CALL_DEPTH), \ X86_FEATURE_CALL_DEPTH) #ifdef CONFIG_CALL_THUNKS_DEBUG DECLARE_PER_CPU(u64, __x86_call_count); DECLARE_PER_CPU(u64, __x86_ret_count); DECLARE_PER_CPU(u64, __x86_stuffs_count); DECLARE_PER_CPU(u64, __x86_ctxsw_count); #endif #else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ static inline void call_depth_return_thunk(void) {} #define CALL_DEPTH_ACCOUNT "" #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ #ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; #include <asm/GEN-for-each-reg.h> #undef GEN #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; #include <asm/GEN-for-each-reg.h> #undef GEN #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; #include <asm/GEN-for-each-reg.h> #undef GEN #ifdef CONFIG_X86_64 /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ # define CALL_NOSPEC \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ "call __x86_indirect_thunk_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ /* * For i386 we use the original ret-equivalent retpoline, because * otherwise we'll run out of registers. We don't care about CET * here, anyway. */ # define CALL_NOSPEC \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ " jmp 904f;\n" \ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: lea 4(%%esp), %%esp;\n" \ " pushl %[thunk_target];\n" \ " ret;\n" \ " .align 16\n" \ "904: call 901b;\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_LFENCE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif #else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif /* The Spectre V2 mitigation variants */ enum spectre_v2_mitigation { SPECTRE_V2_NONE, SPECTRE_V2_RETPOLINE, SPECTRE_V2_LFENCE, SPECTRE_V2_EIBRS, SPECTRE_V2_EIBRS_RETPOLINE, SPECTRE_V2_EIBRS_LFENCE, SPECTRE_V2_IBRS, }; /* The indirect branch speculation control variants */ enum spectre_v2_user_mitigation { SPECTRE_V2_USER_NONE, SPECTRE_V2_USER_STRICT, SPECTRE_V2_USER_STRICT_PREFERRED, SPECTRE_V2_USER_PRCTL, SPECTRE_V2_USER_SECCOMP, }; /* The Speculative Store Bypass disable variants */ enum ssb_mitigation { SPEC_STORE_BYPASS_NONE, SPEC_STORE_BYPASS_DISABLE, SPEC_STORE_BYPASS_PRCTL, SPEC_STORE_BYPASS_SECCOMP, }; static __always_inline void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) { asm volatile(ALTERNATIVE("", "wrmsr", %c[feature]) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)), [feature] "i" (feature) : "memory"); } extern u64 x86_pred_cmd; static inline void indirect_branch_prediction_barrier(void) { alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); } /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; DECLARE_PER_CPU(u64, x86_spec_ctrl_current); extern void update_spec_ctrl_cond(u64 val); extern u64 spec_ctrl_current(void); /* * With retpoline, we must use IBRS to restrict branch prediction * before calling into firmware. * * (Implemented as CPP macros due to header hell.) */ #define firmware_restrict_branch_speculation_start() \ do { \ preempt_disable(); \ alternative_msr_write(MSR_IA32_SPEC_CTRL, \ spec_ctrl_current() | SPEC_CTRL_IBRS, \ X86_FEATURE_USE_IBRS_FW); \ alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, \ X86_FEATURE_USE_IBPB_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ alternative_msr_write(MSR_IA32_SPEC_CTRL, \ spec_ctrl_current(), \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); extern u16 mds_verw_sel; #include <asm/segment.h> /** * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the * instruction is executed. */ static __always_inline void mds_clear_cpu_buffers(void) { static const u16 ds = __KERNEL_DS; /* * Has to be the memory-operand variant because only that * guarantees the CPU buffer flush functionality according to * documentation. The register-operand variant does not. * Works with any segment selector, but a valid writable * data segment is the fastest variant. * * "cc" clobber is required because VERW modifies ZF. */ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } /** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ static __always_inline void mds_idle_clear_cpu_buffers(void) { if (static_branch_likely(&mds_idle_clear)) mds_clear_cpu_buffers(); } #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 | // SPDX-License-Identifier: GPL-2.0-only /* * zpool memory storage api * * Copyright (C) 2014 Dan Streetman * * This is a common frontend for memory storage pool implementations. * Typically, this is used to store compressed memory. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/list.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/zpool.h> struct zpool { struct zpool_driver *driver; void *pool; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); /** * zpool_register_driver() - register a zpool implementation. * @driver: driver to register */ void zpool_register_driver(struct zpool_driver *driver) { spin_lock(&drivers_lock); atomic_set(&driver->refcount, 0); list_add(&driver->list, &drivers_head); spin_unlock(&drivers_lock); } EXPORT_SYMBOL(zpool_register_driver); /** * zpool_unregister_driver() - unregister a zpool implementation. * @driver: driver to unregister. * * Module usage counting is used to prevent using a driver * while/after unloading, so if this is called from module * exit function, this should never fail; if called from * other than the module exit function, and this returns * failure, the driver is in use and must remain available. */ int zpool_unregister_driver(struct zpool_driver *driver) { int ret = 0, refcount; spin_lock(&drivers_lock); refcount = atomic_read(&driver->refcount); WARN_ON(refcount < 0); if (refcount > 0) ret = -EBUSY; else list_del(&driver->list); spin_unlock(&drivers_lock); return ret; } EXPORT_SYMBOL(zpool_unregister_driver); /* this assumes @type is null-terminated. */ static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; spin_lock(&drivers_lock); list_for_each_entry(driver, &drivers_head, list) { if (!strcmp(driver->type, type)) { bool got = try_module_get(driver->owner); if (got) atomic_inc(&driver->refcount); spin_unlock(&drivers_lock); return got ? driver : NULL; } } spin_unlock(&drivers_lock); return NULL; } static void zpool_put_driver(struct zpool_driver *driver) { atomic_dec(&driver->refcount); module_put(driver->owner); } /** * zpool_has_pool() - Check if the pool driver is available * @type: The type of the zpool to check (e.g. zbud, zsmalloc) * * This checks if the @type pool driver is available. This will try to load * the requested module, if needed, but there is no guarantee the module will * still be loaded and available immediately after calling. If this returns * true, the caller should assume the pool is available, but must be prepared * to handle the @zpool_create_pool() returning failure. However if this * returns false, the caller should assume the requested pool type is not * available; either the requested pool type module does not exist, or could * not be loaded, and calling @zpool_create_pool() with the pool type will * fail. * * The @type string must be null-terminated. * * Returns: true if @type pool is available, false if not */ bool zpool_has_pool(char *type) { struct zpool_driver *driver = zpool_get_driver(type); if (!driver) { request_module("zpool-%s", type); driver = zpool_get_driver(type); } if (!driver) return false; zpool_put_driver(driver); return true; } EXPORT_SYMBOL(zpool_has_pool); /** * zpool_create_pool() - Create a new zpool * @type: The type of the zpool to create (e.g. zbud, zsmalloc) * @name: The name of the zpool (e.g. zram0, zswap) * @gfp: The GFP flags to use when allocating the pool. * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the * ops param is NULL, then the created zpool will not be evictable. * * Implementations must guarantee this to be thread-safe. * * The @type and @name strings must be null-terminated. * * Returns: New zpool on success, NULL on failure. */ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) { struct zpool_driver *driver; struct zpool *zpool; pr_debug("creating pool type %s\n", type); driver = zpool_get_driver(type); if (!driver) { request_module("zpool-%s", type); driver = zpool_get_driver(type); } if (!driver) { pr_err("no driver for type %s\n", type); return NULL; } zpool = kmalloc(sizeof(*zpool), gfp); if (!zpool) { pr_err("couldn't create zpool - out of memory\n"); zpool_put_driver(driver); return NULL; } zpool->driver = driver; zpool->pool = driver->create(name, gfp); if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); zpool_put_driver(driver); kfree(zpool); return NULL; } pr_debug("created pool type %s\n", type); return zpool; } /** * zpool_destroy_pool() - Destroy a zpool * @zpool: The zpool to destroy. * * Implementations must guarantee this to be thread-safe, * however only when destroying different pools. The same * pool should only be destroyed once, and should not be used * after it is destroyed. * * This destroys an existing zpool. The zpool should not be in use. */ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); } /** * zpool_get_type() - Get the type of the zpool * @zpool: The zpool to check * * This returns the type of the pool. * * Implementations must guarantee this to be thread-safe. * * Returns: The type of zpool. */ const char *zpool_get_type(struct zpool *zpool) { return zpool->driver->type; } /** * zpool_malloc_support_movable() - Check if the zpool supports * allocating movable memory * @zpool: The zpool to check * * This returns if the zpool supports allocating movable memory. * * Implementations must guarantee this to be thread-safe. * * Returns: true if the zpool supports allocating movable memory, false if not */ bool zpool_malloc_support_movable(struct zpool *zpool) { return zpool->driver->malloc_support_movable; } /** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from. * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be * set to the allocated object handle. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. */ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, unsigned long *handle) { return zpool->driver->malloc(zpool->pool, size, gfp, handle); } /** * zpool_free() - Free previously allocated memory * @zpool: The zpool that allocated the memory. * @handle: The handle to the memory to free. * * This frees previously allocated memory. This does not guarantee * that the pool will actually free memory, only that the memory * in the pool will become available for use by the pool. * * Implementations must guarantee this to be thread-safe, * however only when freeing different handles. The same * handle should only be freed once, and should not be used * after freeing. */ void zpool_free(struct zpool *zpool, unsigned long handle) { zpool->driver->free(zpool->pool, handle); } /** * zpool_map_handle() - Map a previously allocated handle into memory * @zpool: The zpool that the handle was allocated from * @handle: The handle to map * @mapmode: How the memory should be mapped * * This maps a previously allocated handle into memory. The @mapmode * param indicates to the implementation how the memory will be * used, i.e. read-only, write-only, read-write. If the * implementation does not support it, the memory will be treated * as read-write. * * This may hold locks, disable interrupts, and/or preemption, * and the zpool_unmap_handle() must be called to undo those * actions. The code that uses the mapped handle should complete * its operations on the mapped handle memory quickly and unmap * as soon as possible. As the implementation may use per-cpu * data, multiple handles should not be mapped concurrently on * any cpu. * * Returns: A pointer to the handle's mapped memory area. */ void *zpool_map_handle(struct zpool *zpool, unsigned long handle, enum zpool_mapmode mapmode) { return zpool->driver->map(zpool->pool, handle, mapmode); } /** * zpool_unmap_handle() - Unmap a previously mapped handle * @zpool: The zpool that the handle was allocated from * @handle: The handle to unmap * * This unmaps a previously mapped handle. Any locks or other * actions that the implementation took in zpool_map_handle() * will be undone here. The memory area returned from * zpool_map_handle() should no longer be used after this. */ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) { zpool->driver->unmap(zpool->pool, handle); } /** * zpool_get_total_pages() - The total size of the pool * @zpool: The zpool to check * * This returns the total size in pages of the pool. * * Returns: Total size of the zpool in pages. */ u64 zpool_get_total_pages(struct zpool *zpool) { return zpool->driver->total_pages(zpool->pool); } /** * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. * @zpool: The zpool to test * * Some allocators enter non-preemptible context in ->map() callback (e.g. * disable pagefaults) and exit that context in ->unmap(), which limits what * we can do with the mapped object. For instance, we cannot wait for * asynchronous crypto API to decompress such an object or take mutexes * since those will call into the scheduler. This function tells us whether * we use such an allocator. * * Returns: true if zpool can sleep; false otherwise. */ bool zpool_can_sleep_mapped(struct zpool *zpool) { return zpool->driver->sleep_mapped; } MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); |
| 102 45 64 78 9 6 47 6 23 100 16 2 17 102 97 1 22 10 23 6 5 2 13 8 9 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | /* * linux/fs/hfs/bitmap.c * * Copyright (C) 1996-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * Based on GPLed code Copyright (C) 1995 Michael Dreher * * This file contains the code to modify the volume bitmap: * search/set/clear bits. */ #include "hfs_fs.h" /* * hfs_find_zero_bit() * * Description: * Given a block of memory, its length in bits, and a starting bit number, * determine the number of the first zero bits (in left-to-right ordering) * in that range. * * Returns >= 'size' if no zero bits are found in the range. * * Accesses memory in 32-bit aligned chunks of 32-bits and thus * may read beyond the 'size'th bit. */ static u32 hfs_find_set_zero_bits(__be32 *bitmap, u32 size, u32 offset, u32 *max) { __be32 *curr, *end; u32 mask, start, len, n; __be32 val; int i; len = *max; if (!len) return size; curr = bitmap + (offset / 32); end = bitmap + ((size + 31) / 32); /* scan the first partial u32 for zero bits */ val = *curr; if (~val) { n = be32_to_cpu(val); i = offset % 32; mask = (1U << 31) >> i; for (; i < 32; mask >>= 1, i++) { if (!(n & mask)) goto found; } } /* scan complete u32s for the first zero bit */ while (++curr < end) { val = *curr; if (~val) { n = be32_to_cpu(val); mask = 1 << 31; for (i = 0; i < 32; mask >>= 1, i++) { if (!(n & mask)) goto found; } } } return size; found: start = (curr - bitmap) * 32 + i; if (start >= size) return start; /* do any partial u32 at the start */ len = min(size - start, len); while (1) { n |= mask; if (++i >= 32) break; mask >>= 1; if (!--len || n & mask) goto done; } if (!--len) goto done; *curr++ = cpu_to_be32(n); /* do full u32s */ while (1) { n = be32_to_cpu(*curr); if (len < 32) break; if (n) { len = 32; break; } *curr++ = cpu_to_be32(0xffffffff); len -= 32; } /* do any partial u32 at end */ mask = 1U << 31; for (i = 0; i < len; i++) { if (n & mask) break; n |= mask; mask >>= 1; } done: *curr = cpu_to_be32(n); *max = (curr - bitmap) * 32 + i - start; return start; } /* * hfs_vbm_search_free() * * Description: * Search for 'num_bits' consecutive cleared bits in the bitmap blocks of * the hfs MDB. 'mdb' had better be locked or the returned range * may be no longer free, when this functions returns! * XXX Currently the search starts from bit 0, but it should start with * the bit number stored in 's_alloc_ptr' of the MDB. * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * u16 *num_bits: Pointer to the number of cleared bits * to search for * Output Variable(s): * u16 *num_bits: The number of consecutive clear bits of the * returned range. If the bitmap is fragmented, this will be less than * requested and it will be zero, when the disk is full. * Returns: * The number of the first bit of the range of cleared bits which has been * found. When 'num_bits' is zero, this is invalid! * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * 'num_bits' points to a variable of type (u16), which contains * the number of cleared bits to find. * Postconditions: * 'num_bits' is set to the length of the found sequence. */ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) { void *bitmap; u32 pos; /* make sure we have actual work to perform */ if (!*num_bits) return 0; mutex_lock(&HFS_SB(sb)->bitmap_lock); bitmap = HFS_SB(sb)->bitmap; pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); if (pos >= HFS_SB(sb)->fs_ablocks) { if (goal) pos = hfs_find_set_zero_bits(bitmap, goal, 0, num_bits); if (pos >= HFS_SB(sb)->fs_ablocks) { *num_bits = pos = 0; goto out; } } hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: mutex_unlock(&HFS_SB(sb)->bitmap_lock); return pos; } /* * hfs_clear_vbm_bits() * * Description: * Clear the requested bits in the volume bitmap of the hfs filesystem * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * u16 start: The offset of the first bit * u16 count: The number of bits * Output Variable(s): * None * Returns: * 0: no error * -1: One of the bits was already clear. This is a strange * error and when it happens, the filesystem must be repaired! * -2: One or more of the bits are out of range of the bitmap. * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * Postconditions: * Starting with bit number 'start', 'count' bits in the volume bitmap * are cleared. The affected bitmap blocks are marked "dirty", the free * block count of the MDB is updated and the MDB is marked dirty. */ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) { __be32 *curr; u32 mask; int i, len; /* is there any actual work to be done? */ if (!count) return 0; hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); /* are all of the bits in range? */ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; mutex_lock(&HFS_SB(sb)->bitmap_lock); /* bitmap is always on a 32-bit boundary */ curr = HFS_SB(sb)->bitmap + (start / 32); len = count; /* do any partial u32 at the start */ i = start % 32; if (i) { int j = 32 - i; mask = 0xffffffffU << j; if (j > count) { mask |= 0xffffffffU >> (i + count); *curr &= cpu_to_be32(mask); goto out; } *curr++ &= cpu_to_be32(mask); count -= j; } /* do full u32s */ while (count >= 32) { *curr++ = 0; count -= 32; } /* do any partial u32 at end */ if (count) { mask = 0xffffffffU >> count; *curr &= cpu_to_be32(mask); } out: HFS_SB(sb)->free_ablocks += len; mutex_unlock(&HFS_SB(sb)->bitmap_lock); hfs_bitmap_dirty(sb); return 0; } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Do sleep inside a spin-lock * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> */ #include <linux/export.h> #include <sound/core.h> #include "seq_lock.h" /* wait until all locks are released */ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) { int warn_count = 5 * HZ; if (atomic_read(lockp) < 0) { pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line); return; } while (atomic_read(lockp) > 0) { if (warn_count-- == 0) pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line); schedule_timeout_uninterruptible(1); } } EXPORT_SYMBOL(snd_use_lock_sync_helper); |
| 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr_user.c * Handler for extended user attributes. * * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #include <linux/string.h> #include <linux/fs.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" static bool ext4_xattr_user_list(struct dentry *dentry) { return test_opt(dentry->d_sb, XATTR_USER); } static int ext4_xattr_user_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); } static int ext4_xattr_user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } const struct xattr_handler ext4_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext4_xattr_user_list, .get = ext4_xattr_user_get, .set = ext4_xattr_user_set, }; |
| 5 5 5 3 5 5 5 5 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip6_flowlabel.c IPv6 flowlabel manager. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> #include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/rawv6.h> #include <net/transp_v6.h> #include <linux/uaccess.h> #define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified in old IPv6 RFC. Well, it was reasonable value. */ #define FL_MAX_LINGER 150 /* Maximal linger timeout */ /* FL hash table */ #define FL_MAX_PER_SOCK 32 #define FL_MAX_SIZE 4096 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) static atomic_t fl_size = ATOMIC_INIT(0); static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc); /* FL hash table lock: it protects only of GC */ static DEFINE_SPINLOCK(ip6_fl_lock); /* Big socket sock */ static DEFINE_SPINLOCK(ip6_sk_fl_lock); DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); EXPORT_SYMBOL(ipv6_flowlabel_exclusive); #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference(fl_ht[(hash)]); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_fl_continue_rcu(fl) \ for (fl = rcu_dereference(fl->next); \ fl != NULL; \ fl = rcu_dereference(fl->next)) #define for_each_sk_fl_rcu(np, sfl) \ for (sfl = rcu_dereference(np->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; for_each_fl_rcu(FL_HASH(label), fl) { if (fl->label == label && net_eq(fl->fl_net, net)) return fl; } return NULL; } static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) { struct ip6_flowlabel *fl; rcu_read_lock(); fl = __fl_lookup(net, label); if (fl && !atomic_inc_not_zero(&fl->users)) fl = NULL; rcu_read_unlock(); return fl; } static bool fl_shared_exclusive(struct ip6_flowlabel *fl) { return fl->share == IPV6_FL_S_EXCL || fl->share == IPV6_FL_S_PROCESS || fl->share == IPV6_FL_S_USER; } static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); kfree(fl); } static void fl_free(struct ip6_flowlabel *fl) { if (!fl) return; if (fl_shared_exclusive(fl) || fl->opt) static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (atomic_dec_and_test(&fl->users)) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (fl->opt && fl->share == IPV6_FL_S_EXCL) { struct ipv6_txoptions *opt = fl->opt; fl->opt = NULL; kfree(opt); } if (!timer_pending(&ip6_fl_gc_timer) || time_after(ip6_fl_gc_timer.expires, ttd)) mod_timer(&ip6_fl_gc_timer, ttd); } spin_unlock_bh(&ip6_fl_lock); } static void ip6_fl_gc(struct timer_list *unused) { int i; unsigned long now = jiffies; unsigned long sched = 0; spin_lock(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (atomic_read(&fl->users) == 0) { unsigned long ttd = fl->lastuse + fl->linger; if (time_after(ttd, fl->expires)) fl->expires = ttd; ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } if (!sched || time_before(ttd, sched)) sched = ttd; } flp = &fl->next; } } if (!sched && atomic_read(&fl_size)) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); } spin_unlock(&ip6_fl_lock); } static void __net_exit ip6_fl_purge(struct net *net) { int i; spin_lock_bh(&ip6_fl_lock); for (i = 0; i <= FL_HASH_MASK; i++) { struct ip6_flowlabel *fl; struct ip6_flowlabel __rcu **flp; flp = &fl_ht[i]; while ((fl = rcu_dereference_protected(*flp, lockdep_is_held(&ip6_fl_lock))) != NULL) { if (net_eq(fl->fl_net, net) && atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); atomic_dec(&fl_size); continue; } flp = &fl->next; } } spin_unlock_bh(&ip6_fl_lock); } static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *fl, __be32 label) { struct ip6_flowlabel *lfl; fl->label = label & IPV6_FLOWLABEL_MASK; rcu_read_lock(); spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) break; } } } else { /* * we dropper the ip6_fl_lock, so this entry could reappear * and we need to recheck with it. * * OTOH no need to search the active socket first, like it is * done in ipv6_flowlabel_opt - sock is locked, so new entry * with the same label can only appear on another sock */ lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return lfl; } } fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); atomic_inc(&fl_size); spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return NULL; } /* Socket flowlabel lists */ struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { fl->lastuse = jiffies; rcu_read_unlock(); return fl; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (!rcu_access_pointer(np->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { np->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); spin_lock_bh(&ip6_sk_fl_lock); } spin_unlock_bh(&ip6_sk_fl_lock); } /* Service routines */ /* It is the only difficult place. flowlabel enforces equal headers before and including routing header, however user may supply options following rthdr. */ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt) { struct ipv6_txoptions *fl_opt = fl->opt; if (!fopt || fopt->opt_flen == 0) return fl_opt; if (fl_opt) { opt_space->hopopt = fl_opt->hopopt; opt_space->dst0opt = fl_opt->dst0opt; opt_space->srcrt = fl_opt->srcrt; opt_space->opt_nflen = fl_opt->opt_nflen; } else { if (fopt->opt_nflen == 0) return fopt; opt_space->hopopt = NULL; opt_space->dst0opt = NULL; opt_space->srcrt = NULL; opt_space->opt_nflen = 0; } opt_space->dst1opt = fopt->dst1opt; opt_space->opt_flen = fopt->opt_flen; opt_space->tot_len = fopt->tot_len; return opt_space; } EXPORT_SYMBOL_GPL(fl6_merge_options); static unsigned long check_linger(unsigned long ttl) { if (ttl < FL_MIN_LINGER) return FL_MIN_LINGER*HZ; if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) return 0; return ttl*HZ; } static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) { linger = check_linger(linger); if (!linger) return -EPERM; expires = check_linger(expires); if (!expires) return -EPERM; spin_lock_bh(&ip6_fl_lock); fl->lastuse = jiffies; if (time_before(fl->linger, linger)) fl->linger = linger; if (time_before(expires, fl->linger)) expires = fl->linger; if (time_before(fl->expires, fl->lastuse + expires)) fl->expires = fl->lastuse + expires; spin_unlock_bh(&ip6_fl_lock); return 0; } static struct ip6_flowlabel * fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen, int *err_p) { struct ip6_flowlabel *fl = NULL; int olen; int addr_type; int err; olen = optlen - CMSG_ALIGN(sizeof(*freq)); err = -EINVAL; if (olen > 64 * 1024) goto done; err = -ENOMEM; fl = kzalloc(sizeof(*fl), GFP_KERNEL); if (!fl) goto done; if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); if (!fl->opt) goto done; memset(fl->opt, 0, sizeof(*fl->opt)); fl->opt->tot_len = sizeof(*fl->opt) + olen; err = -EFAULT; if (copy_from_sockptr_offset(fl->opt + 1, optval, CMSG_ALIGN(sizeof(*freq)), olen)) goto done; msg.msg_controllen = olen; msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; if (fl->opt->opt_flen) goto done; if (fl->opt->opt_nflen == 0) { kfree(fl->opt); fl->opt = NULL; } } fl->fl_net = net; fl->expires = jiffies; err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); if (err) goto done; fl->share = freq->flr_share; addr_type = ipv6_addr_type(&freq->flr_dst); if ((addr_type & IPV6_ADDR_MAPPED) || addr_type == IPV6_ADDR_ANY) { err = -EINVAL; goto done; } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: break; case IPV6_FL_S_PROCESS: fl->owner.pid = get_task_pid(current, PIDTYPE_PID); break; case IPV6_FL_S_USER: fl->owner.uid = current_euid(); break; default: err = -EINVAL; goto done; } if (fl_shared_exclusive(fl) || fl->opt) { WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); } return fl; done: if (fl) { kfree(fl->opt); kfree(fl); } *err_p = err; return NULL; } static int mem_check(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) count++; rcu_read_unlock(); if (room <= 0 || ((count >= FL_MAX_PER_SOCK || (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; return 0; } static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, struct ip6_flowlabel *fl) { spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; sfl->next = np->ipv6_fl_list; rcu_assign_pointer(np->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist *sfl; if (flags & IPV6_FL_F_REMOTE) { freq->flr_label = np->rcv_flowinfo & IPV6_FLOWLABEL_MASK; return 0; } if (inet6_test_bit(REPFLOW, sk)) { freq->flr_label = np->flow_label; return 0; } rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; freq->flr_dst = sfl->fl->dst; freq->flr_share = sfl->fl->share; freq->flr_expires = (sfl->fl->expires - jiffies) / HZ; freq->flr_linger = sfl->fl->linger / HZ; spin_unlock_bh(&ip6_fl_lock); rcu_read_unlock(); return 0; } } rcu_read_unlock(); return -ENOENT; } #define socklist_dereference(__sflp) \ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock)) static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_fl_socklist __rcu **sflp; struct ipv6_fl_socklist *sfl; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (!inet6_test_bit(REPFLOW, sk)) return -ESRCH; np->flow_label = 0; inet6_clear_bit(REPFLOW, sk); return 0; } spin_lock_bh(&ip6_sk_fl_lock); for (sflp = &np->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) goto found; } spin_unlock_bh(&ip6_sk_fl_lock); return -ESRCH; found: if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK)) np->flow_label &= ~IPV6_FLOWLABEL_MASK; *sflp = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); kfree_rcu(sfl, rcu); return 0; } static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); rcu_read_unlock(); return err; } } rcu_read_unlock(); if (freq->flr_share == IPV6_FL_S_NONE && ns_capable(net->user_ns, CAP_NET_ADMIN)) { struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label); if (fl) { err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); fl_release(fl); return err; } } return -ESRCH; } static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, sockptr_t optval, int optlen) { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; if (freq->flr_flags & IPV6_FL_F_REFLECT) { if (net->ipv6.sysctl.flowlabel_consistency) { net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n"); return -EPERM; } if (sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; inet6_set_bit(REPFLOW, sk); return 0; } if (freq->flr_label & ~IPV6_FLOWLABEL_MASK) return -EINVAL; if (net->ipv6.sysctl.flowlabel_state_ranges && (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG)) return -ERANGE; fl = fl_create(net, sk, freq, optval, optlen, &err); if (!fl) return err; sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); for_each_sk_fl_rcu(np, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); goto done; } fl1 = sfl->fl; if (!atomic_inc_not_zero(&fl1->users)) fl1 = NULL; break; } } rcu_read_unlock(); if (!fl1) fl1 = fl_lookup(net, freq->flr_label); if (fl1) { recheck: err = -EEXIST; if (freq->flr_flags&IPV6_FL_F_EXCL) goto release; err = -EPERM; if (fl1->share == IPV6_FL_S_EXCL || fl1->share != fl->share || ((fl1->share == IPV6_FL_S_PROCESS) && (fl1->owner.pid != fl->owner.pid)) || ((fl1->share == IPV6_FL_S_USER) && !uid_eq(fl1->owner.uid, fl->owner.uid))) goto release; err = -ENOMEM; if (!sfl1) goto release; if (fl->linger > fl1->linger) fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; fl_link(np, sfl1, fl1); fl_free(fl); return 0; release: fl_release(fl1); goto done; } } err = -ENOENT; if (!(freq->flr_flags & IPV6_FL_F_CREATE)) goto done; err = -ENOMEM; if (!sfl1) goto done; err = mem_check(sk); if (err != 0) goto done; fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; if (!freq->flr_label) { size_t offset = offsetof(struct in6_flowlabel_req, flr_label); if (copy_to_sockptr_offset(optval, offset, &fl->label, sizeof(fl->label))) { /* Intentionally ignore fault. */ } } fl_link(np, sfl1, fl); return 0; done: fl_free(fl); kfree(sfl1); return err; } int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen) { struct in6_flowlabel_req freq; if (optlen < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; switch (freq.flr_action) { case IPV6_FL_A_PUT: return ipv6_flowlabel_put(sk, &freq); case IPV6_FL_A_RENEW: return ipv6_flowlabel_renew(sk, &freq); case IPV6_FL_A_GET: return ipv6_flowlabel_get(sk, &freq, optval, optlen); default: return -EINVAL; } } #ifdef CONFIG_PROC_FS struct ip6fl_iter_state { struct seq_net_private p; struct pid_namespace *pid_ns; int bucket; }; #define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) { struct ip6_flowlabel *fl = NULL; struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); struct net *net = seq_file_net(seq); for_each_fl_continue_rcu(fl) { if (net_eq(fl->fl_net, net)) goto out; } try_again: if (++state->bucket <= FL_HASH_MASK) { for_each_fl_rcu(state->bucket, fl) { if (net_eq(fl->fl_net, net)) goto out; } goto try_again; } fl = NULL; out: return fl; } static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) { struct ip6_flowlabel *fl = ip6fl_get_first(seq); if (fl) while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) --pos; return pos ? NULL : fl; } static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb); rcu_read_lock(); return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip6_flowlabel *fl; if (v == SEQ_START_TOKEN) fl = ip6fl_get_first(seq); else fl = ip6fl_get_next(seq, v); ++*pos; return fl; } static void ip6fl_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6fl_seq_show(struct seq_file *seq, void *v) { struct ip6fl_iter_state *state = ip6fl_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Label S Owner Users Linger Expires Dst Opt\n"); } else { struct ip6_flowlabel *fl = v; seq_printf(seq, "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", (unsigned int)ntohl(fl->label), fl->share, ((fl->share == IPV6_FL_S_PROCESS) ? pid_nr_ns(fl->owner.pid, state->pid_ns) : ((fl->share == IPV6_FL_S_USER) ? from_kuid_munged(seq_user_ns(seq), fl->owner.uid) : 0)), atomic_read(&fl->users), fl->linger/HZ, (long)(fl->expires - jiffies)/HZ, &fl->dst, fl->opt ? fl->opt->opt_nflen : 0); } return 0; } static const struct seq_operations ip6fl_seq_ops = { .start = ip6fl_seq_start, .next = ip6fl_seq_next, .stop = ip6fl_seq_stop, .show = ip6fl_seq_show, }; static int __net_init ip6_flowlabel_proc_init(struct net *net) { if (!proc_create_net("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_ops, sizeof(struct ip6fl_iter_state))) return -ENOMEM; return 0; } static void __net_exit ip6_flowlabel_proc_fini(struct net *net) { remove_proc_entry("ip6_flowlabel", net->proc_net); } #else static inline int ip6_flowlabel_proc_init(struct net *net) { return 0; } static inline void ip6_flowlabel_proc_fini(struct net *net) { } #endif static void __net_exit ip6_flowlabel_net_exit(struct net *net) { ip6_fl_purge(net); ip6_flowlabel_proc_fini(net); } static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, }; int ip6_flowlabel_init(void) { return register_pernet_subsys(&ip6_flowlabel_net_ops); } void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } |
| 32 23 30 9 32 31 5 8 58 58 34 57 22 2 33 27 32 32 24 59 1 58 25 32 30 59 60 3 7 23 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" #include <trace/events/erofs.h> struct erofs_qstr { const unsigned char *name; const unsigned char *end; }; /* based on the end of qn is accurate and it must have the trailing '\0' */ static inline int erofs_dirnamecmp(const struct erofs_qstr *qn, const struct erofs_qstr *qd, unsigned int *matched) { unsigned int i = *matched; /* * on-disk error, let's only BUG_ON in the debugging mode. * otherwise, it will return 1 to just skip the invalid name * and go on (in consideration of the lookup performance). */ DBG_BUGON(qd->name > qd->end); /* qd could not have trailing '\0' */ /* However it is absolutely safe if < qd->end */ while (qd->name + i < qd->end && qd->name[i] != '\0') { if (qn->name[i] != qd->name[i]) { *matched = i; return qn->name[i] > qd->name[i] ? 1 : -1; } ++i; } *matched = i; /* See comments in __d_alloc on the terminating NUL character */ return qn->name[i] == '\0' ? 0 : 1; } #define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, u8 *data, unsigned int dirblksize, const int ndirents) { int head, back; unsigned int startprfx, endprfx; struct erofs_dirent *const de = (struct erofs_dirent *)data; /* since the 1st dirent has been evaluated previously */ head = 1; back = ndirents - 1; startprfx = endprfx = 0; while (head <= back) { const int mid = head + (back - head) / 2; const int nameoff = nameoff_from_disk(de[mid].nameoff, dirblksize); unsigned int matched = min(startprfx, endprfx); struct erofs_qstr dname = { .name = data + nameoff, .end = mid >= ndirents - 1 ? data + dirblksize : data + nameoff_from_disk(de[mid + 1].nameoff, dirblksize) }; /* string comparison without already matched prefix */ int ret = erofs_dirnamecmp(name, &dname, &matched); if (!ret) { return de + mid; } else if (ret > 0) { head = mid + 1; startprfx = matched; } else { back = mid - 1; endprfx = matched; } } return ERR_PTR(-ENOENT); } static void *erofs_find_target_block(struct erofs_buf *target, struct inode *dir, struct erofs_qstr *name, int *_ndirents) { unsigned int bsz = i_blocksize(dir); int head = 0, back = erofs_iblks(dir) - 1; unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; buf.mapping = dir->i_mapping; de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); int diff; unsigned int matched; struct erofs_qstr dname; if (!ndirents) { erofs_put_metabuf(&buf); erofs_err(dir->i_sb, "corrupted dir block %d @ nid %llu", mid, EROFS_I(dir)->nid); DBG_BUGON(1); de = ERR_PTR(-EFSCORRUPTED); goto out; } matched = min(startprfx, endprfx); dname.name = (u8 *)de + nameoff; if (ndirents == 1) dname.end = (u8 *)de + bsz; else dname.end = (u8 *)de + nameoff_from_disk(de[1].nameoff, bsz); /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); if (diff < 0) { erofs_put_metabuf(&buf); back = mid - 1; endprfx = matched; continue; } if (!IS_ERR(candidate)) erofs_put_metabuf(target); *target = buf; if (!diff) { *_ndirents = 0; return de; } head = mid + 1; startprfx = matched; candidate = de; *_ndirents = ndirents; continue; } out: /* free if the candidate is valid */ if (!IS_ERR(candidate)) erofs_put_metabuf(target); return de; } return candidate; } int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type) { int ndirents; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; struct erofs_qstr qn; if (!dir->i_size) return -ENOENT; qn.name = name->name; qn.end = name->name + name->len; buf.mapping = dir->i_mapping; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); if (ndirents) de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); *d_type = de->file_type; } erofs_put_metabuf(&buf); return PTR_ERR_OR_ZERO(de); } static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { int err; erofs_nid_t nid; unsigned int d_type; struct inode *inode; trace_erofs_lookup(dir, dentry, flags); if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); if (err == -ENOENT) /* negative dentry */ inode = NULL; else if (err) inode = ERR_PTR(err); else inode = erofs_iget(dir->i_sb, nid); return d_splice_alias(inode, dentry); } const struct inode_operations erofs_dir_iops = { .lookup = erofs_lookup, .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_inode_acl = erofs_get_acl, .fiemap = erofs_fiemap, }; |
| 810 814 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | // SPDX-License-Identifier: GPL-2.0 /* * Device physical location support * * Author: Won Chung <wonchung@google.com> */ #include <linux/acpi.h> #include <linux/sysfs.h> #include "physical_location.h" bool dev_add_physical_location(struct device *dev) { struct acpi_pld_info *pld; acpi_status status; if (!has_acpi_companion(dev)) return false; status = acpi_get_physical_device_location(ACPI_HANDLE(dev), &pld); if (ACPI_FAILURE(status)) return false; dev->physical_location = kzalloc(sizeof(*dev->physical_location), GFP_KERNEL); if (!dev->physical_location) { ACPI_FREE(pld); return false; } dev->physical_location->panel = pld->panel; dev->physical_location->vertical_position = pld->vertical_position; dev->physical_location->horizontal_position = pld->horizontal_position; dev->physical_location->dock = pld->dock; dev->physical_location->lid = pld->lid; ACPI_FREE(pld); return true; } static ssize_t panel_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *panel; switch (dev->physical_location->panel) { case DEVICE_PANEL_TOP: panel = "top"; break; case DEVICE_PANEL_BOTTOM: panel = "bottom"; break; case DEVICE_PANEL_LEFT: panel = "left"; break; case DEVICE_PANEL_RIGHT: panel = "right"; break; case DEVICE_PANEL_FRONT: panel = "front"; break; case DEVICE_PANEL_BACK: panel = "back"; break; default: panel = "unknown"; } return sysfs_emit(buf, "%s\n", panel); } static DEVICE_ATTR_RO(panel); static ssize_t vertical_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *vertical_position; switch (dev->physical_location->vertical_position) { case DEVICE_VERT_POS_UPPER: vertical_position = "upper"; break; case DEVICE_VERT_POS_CENTER: vertical_position = "center"; break; case DEVICE_VERT_POS_LOWER: vertical_position = "lower"; break; default: vertical_position = "unknown"; } return sysfs_emit(buf, "%s\n", vertical_position); } static DEVICE_ATTR_RO(vertical_position); static ssize_t horizontal_position_show(struct device *dev, struct device_attribute *attr, char *buf) { const char *horizontal_position; switch (dev->physical_location->horizontal_position) { case DEVICE_HORI_POS_LEFT: horizontal_position = "left"; break; case DEVICE_HORI_POS_CENTER: horizontal_position = "center"; break; case DEVICE_HORI_POS_RIGHT: horizontal_position = "right"; break; default: horizontal_position = "unknown"; } return sysfs_emit(buf, "%s\n", horizontal_position); } static DEVICE_ATTR_RO(horizontal_position); static ssize_t dock_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->physical_location->dock ? "yes" : "no"); } static DEVICE_ATTR_RO(dock); static ssize_t lid_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", dev->physical_location->lid ? "yes" : "no"); } static DEVICE_ATTR_RO(lid); static struct attribute *dev_attr_physical_location[] = { &dev_attr_panel.attr, &dev_attr_vertical_position.attr, &dev_attr_horizontal_position.attr, &dev_attr_dock.attr, &dev_attr_lid.attr, NULL, }; const struct attribute_group dev_attr_physical_location_group = { .name = "physical_location", .attrs = dev_attr_physical_location, }; |
| 408 370 319 7 2 4 2 39 47 293 294 294 294 294 294 293 293 15 13 381 381 261 255 23 23 120 120 4 117 11 120 4 120 120 5 5 248 246 12 245 249 249 249 134 152 44 15 58 59 59 59 34 2 36 36 36 21 21 21 1 20 20 20 6 6 6 6 6 6 15 15 18 258 246 244 96 1 83 16 96 10 8 2 10 10 31 1 10 7 1 20 28 7 3 4 7 7 43 28 2 19 3 31 11 11 28 1 18 1 1 40 2 11 31 34 12 42 42 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_health.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_parent.h" #include "xfs_ag.h" #include "xfs_ialloc.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", .len = 2, .type = XFS_DIR3_FT_DIR, }; const struct xfs_name xfs_name_dot = { .name = (const unsigned char *)".", .len = 1, .type = XFS_DIR3_FT_DIR, }; /* * Convert inode mode to directory entry filetype */ unsigned char xfs_mode_to_ftype( int mode) { switch (mode & S_IFMT) { case S_IFREG: return XFS_DIR3_FT_REG_FILE; case S_IFDIR: return XFS_DIR3_FT_DIR; case S_IFCHR: return XFS_DIR3_FT_CHRDEV; case S_IFBLK: return XFS_DIR3_FT_BLKDEV; case S_IFIFO: return XFS_DIR3_FT_FIFO; case S_IFSOCK: return XFS_DIR3_FT_SOCK; case S_IFLNK: return XFS_DIR3_FT_SYMLINK; default: return XFS_DIR3_FT_UNKNOWN; } } /* * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. */ xfs_dahash_t xfs_ascii_ci_hashname( const struct xfs_name *name) { xfs_dahash_t hash; int i; for (i = 0, hash = 0; i < name->len; i++) hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } enum xfs_dacmp xfs_ascii_ci_compname( struct xfs_da_args *args, const unsigned char *name, int len) { enum xfs_dacmp result; int i; if (args->namelen != len) return XFS_CMP_DIFFERENT; result = XFS_CMP_EXACT; for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; if (xfs_ascii_ci_xfrm(args->name[i]) != xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } return result; } int xfs_da_mount( struct xfs_mount *mp) { struct xfs_da_geometry *dageo; ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); return -ENOMEM; } /* set up directory geometry */ dageo = mp->m_dir_geo; dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir3_data_hdr); } else { dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir2_data_hdr); } dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) / sizeof(struct xfs_dir2_leaf_entry); dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) / sizeof(xfs_dir2_data_off_t); dageo->data_first_offset = dageo->data_entry_offset + xfs_dir2_data_entsize(mp, 1) + xfs_dir2_data_entsize(mp, 2); /* * Now we've set up the block conversion variables, we can calculate the * segment block constants using the geometry structure. */ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ dageo = mp->m_attr_geo; dageo->blklog = mp->m_sb.sb_blocklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = 1 << dageo->blklog; dageo->fsbcount = 1; dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); if (xfs_has_large_extent_counts(mp)) dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; else dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } void xfs_da_unmount( struct xfs_mount *mp) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); } /* * Return 1 if directory contains only "." and "..". */ int xfs_dir_isempty( xfs_inode_t *dp) { xfs_dir2_sf_hdr_t *sfp; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = dp->i_df.if_data; return !sfp->count; } /* * Validate a given inode number. */ int xfs_dir_ino_validate( xfs_mount_t *mp, xfs_ino_t ino) { bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; } return 0; } /* * Initialize a directory with its "." and ".." entries. */ int xfs_dir_init( xfs_trans_t *tp, xfs_inode_t *dp, xfs_inode_t *pdp) { struct xfs_da_args *args; int error; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); if (error) return error; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; args->owner = dp->i_ino; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } enum xfs_dir2_fmt xfs_dir2_format( struct xfs_da_args *args, int *error) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; xfs_fileoff_t eof; xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); *error = 0; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return XFS_DIR2_FMT_SF; *error = xfs_bmap_last_offset(dp, &eof, XFS_DATA_FORK); if (*error) return XFS_DIR2_FMT_ERROR; if (eof == XFS_B_TO_FSB(mp, geo->blksize)) { if (XFS_IS_CORRUPT(mp, dp->i_disk_size != geo->blksize)) { xfs_da_mark_sick(args); *error = -EFSCORRUPTED; return XFS_DIR2_FMT_ERROR; } return XFS_DIR2_FMT_BLOCK; } if (eof == geo->leafblk + geo->fsbcount) return XFS_DIR2_FMT_LEAF; return XFS_DIR2_FMT_NODE; } int xfs_dir_createname_args( struct xfs_da_args *args) { int error; if (!args->inumber) args->op_flags |= XFS_DA_OP_JUSTCHECK; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_addname(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_addname(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_addname(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_addname(args); default: return error; } } /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. */ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (inum) { rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; XFS_STATS_INC(dp->i_mount, xs_dir_create); } args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args->owner = dp->i_ino; rval = xfs_dir_createname_args(args); kfree(args); return rval; } /* * If doing a CI lookup and case-insensitive match, dup actual name into * args.value. Return EEXIST for success (ie. name found) or an error. */ int xfs_dir_cilookup_result( struct xfs_da_args *args, const unsigned char *name, int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) return -ENOENT; if (args->cmpresult != XFS_CMP_CASE || !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; args->value = kmalloc(len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } int xfs_dir_lookup_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: error = xfs_dir2_sf_lookup(args); break; case XFS_DIR2_FMT_BLOCK: error = xfs_dir2_block_lookup(args); break; case XFS_DIR2_FMT_LEAF: error = xfs_dir2_leaf_lookup(args); break; case XFS_DIR2_FMT_NODE: error = xfs_dir2_node_lookup(args); break; default: break; } if (error != -EEXIST) return error; return 0; } /* * Lookup a name in a directory, give back the inode number. * If ci_name is not NULL, returns the actual name in ci_name if it differs * to name, or ci_name->name is set to NULL for an exact match. */ int xfs_dir_lookup( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { struct xfs_da_args *args; int rval; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->dp = dp; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; args->owner = dp->i_ino; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); rval = xfs_dir_lookup_args(args); if (!rval) { *inum = args->inumber; if (ci_name) { ci_name->name = args->value; ci_name->len = args->valuelen; } } xfs_iunlock(dp, lock_mode); kfree(args); return rval; } int xfs_dir_removename_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_removename(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_removename(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_removename(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_removename(args); default: return error; } } /* * Remove an entry from a directory. */ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = ino; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->owner = dp->i_ino; rval = xfs_dir_removename_args(args); kfree(args); return rval; } int xfs_dir_replace_args( struct xfs_da_args *args) { int error; switch (xfs_dir2_format(args, &error)) { case XFS_DIR2_FMT_SF: return xfs_dir2_sf_replace(args); case XFS_DIR2_FMT_BLOCK: return xfs_dir2_block_replace(args); case XFS_DIR2_FMT_LEAF: return xfs_dir2_leaf_replace(args); case XFS_DIR2_FMT_NODE: return xfs_dir2_node_replace(args); default: return error; } } /* * Replace the inode number of a directory entry. */ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->owner = dp->i_ino; rval = xfs_dir_replace_args(args); kfree(args); return rval; } /* * See if this entry can be added to the directory without allocating space. */ int xfs_dir_canenter( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } /* * Utility routines. */ /* * Add a block to the directory. * * This routine is for data and free blocks, not leaf/node blocks which are * handled by xfs_da_grow_inode. */ int xfs_dir2_grow_inode( struct xfs_da_args *args, int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ xfs_dir2_db_t *dbp) /* out: block number added */ { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; xfs_fileoff_t bno; /* directory offset of new block */ int count; /* count of filesystem blocks */ int error; trace_xfs_dir2_grow_inode(args, space); /* * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. */ if (space == XFS_DIR2_DATA_SPACE) { xfs_fsize_t size; /* directory file (data) size */ size = XFS_FSB_TO_B(mp, bno + count); if (size > dp->i_disk_size) { dp->i_disk_size = size; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } return 0; } /* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. */ int xfs_dir2_shrink_inode( struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ int done; /* bunmap is finished */ struct xfs_inode *dp; int error; struct xfs_mount *mp; struct xfs_trans *tp; trace_xfs_dir2_shrink_inode(args, db); dp = args->dp; mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); /* Unmap the fsblock(s). */ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, &done); if (error) { /* * ENOSPC actually can happen if we're in a removename with no * space reservation, and the resulting block removal would * cause a bmap btree split or conversion from extents to btree. * This can only happen for un-fragmented directory blocks, * since you need to be punching out the middle of an extent. * In this case we need to leave the block in the file, and not * binval it. So the block has to be in a consistent empty * state and appropriately logged. We don't free up the buffer, * the caller can tell it hasn't happened since it got an error * back. */ return error; } ASSERT(done); /* * Invalidate the buffer from the transaction. */ xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { /* * This can't really happen unless there's kernel corruption. */ return error; } if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); /* * Set the size to the new last block. */ dp->i_disk_size = XFS_FSB_TO_B(mp, bno); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } /* Returns true if the directory entry name is valid. */ bool xfs_dir2_namecheck( const void *name, size_t length) { /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. */ if (length >= MAXNAMELEN) return false; /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } enum xfs_dacmp xfs_dir2_compname( struct xfs_da_args *args, const unsigned char *name, int len) { if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } #ifdef CONFIG_XFS_LIVE_HOOKS /* * Use a static key here to reduce the overhead of directory live update hooks. * If the compiler supports jump labels, the static branch will be replaced by * a nop sled when there are no hook users. Online fsck is currently the only * caller, so this is a reasonable tradeoff. * * Note: Patching the kernel code requires taking the cpu hotplug lock. Other * parts of the kernel allocate memory with that lock held, which means that * XFS callers cannot hold any locks that might be used by memory reclaim or * writeback when calling the static_branch_{inc,dec} functions. */ DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); void xfs_dir_hook_disable(void) { xfs_hooks_switch_off(&xfs_dir_hooks_switch); } void xfs_dir_hook_enable(void) { xfs_hooks_switch_on(&xfs_dir_hooks_switch); } /* Call hooks for a directory update relating to a child dirent update. */ inline void xfs_dir_update_hook( struct xfs_inode *dp, struct xfs_inode *ip, int delta, const struct xfs_name *name) { if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { struct xfs_dir_update_params p = { .dp = dp, .ip = ip, .delta = delta, .name = name, }; struct xfs_mount *mp = ip->i_mount; xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); } } /* Call the specified function during a directory update. */ int xfs_dir_hook_add( struct xfs_mount *mp, struct xfs_dir_hook *hook) { return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); } /* Stop calling the specified function during a directory update. */ void xfs_dir_hook_del( struct xfs_mount *mp, struct xfs_dir_hook *hook) { xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); } /* Configure directory update hook functions. */ void xfs_dir_hook_setup( struct xfs_dir_hook *hook, notifier_fn_t mod_fn) { xfs_hook_setup(&hook->dirent_hook, mod_fn); } #endif /* CONFIG_XFS_LIVE_HOOKS */ /* * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip * into @dp under the given @name. If @ip is a directory, it will be * initialized. Both inodes must have the ILOCK held and the transaction must * have sufficient blocks reserved. */ int xfs_dir_create_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); if (error) { ASSERT(error != -ENOSPC); return error; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); if (S_ISDIR(VFS_I(ip)->i_mode)) { error = xfs_dir_init(tp, ip, dp); if (error) return error; xfs_bumplink(tp, dp); } /* * If we have parent pointers, we need to add the attribute containing * the parent information now. */ if (du->ppargs) { error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, 1, name); return 0; } /* * Given a directory @dp, an existing non-directory inode @ip, and a @name, * link @ip into @dp under the given @name. Both inodes must have the ILOCK * held. */ int xfs_dir_add_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; struct xfs_mount *mp = tp->t_mountp; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); ASSERT(!S_ISDIR(VFS_I(ip)->i_mode)); if (!resblks) { error = xfs_dir_canenter(tp, dp, name); if (error) return error; } /* * Handle initial link state of O_TMPFILE inode */ if (VFS_I(ip)->i_nlink == 0) { struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); error = xfs_iunlink_remove(tp, pag, ip); xfs_perag_put(pag); if (error) return error; } error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks); if (error) return error; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); xfs_bumplink(tp, ip); /* * If we have parent pointers, we now need to add the parent record to * the attribute fork of the inode. If this is the initial parent * attribute, we need to create it correctly, otherwise we can just add * the parent to the inode. */ if (du->ppargs) { error = xfs_parent_addname(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, 1, name); return 0; } /* * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip) * entry from the directory. Both inodes must have the ILOCK held. */ int xfs_dir_remove_child( struct xfs_trans *tp, unsigned int resblks, struct xfs_dir_update *du) { struct xfs_inode *dp = du->dp; const struct xfs_name *name = du->name; struct xfs_inode *ip = du->ip; int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); xfs_assert_ilocked(dp, XFS_ILOCK_EXCL); /* * If we're removing a directory perform some additional validation. */ if (S_ISDIR(VFS_I(ip)->i_mode)) { ASSERT(VFS_I(ip)->i_nlink >= 2); if (VFS_I(ip)->i_nlink != 2) return -ENOTEMPTY; if (!xfs_dir_isempty(ip)) return -ENOTEMPTY; /* Drop the link from ip's "..". */ error = xfs_droplink(tp, dp); if (error) return error; /* Drop the "." link from ip to self. */ error = xfs_droplink(tp, ip); if (error) return error; /* * Point the unlinked child directory's ".." entry to the root * directory to eliminate back-references to inodes that may * get freed before the child directory is closed. If the fs * gets shrunk, this can lead to dirent inode validation errors. */ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, tp->t_mountp->m_sb.sb_rootino, 0); if (error) return error; } } else { /* * When removing a non-directory we need to log the parent * inode here. For a directory this is done implicitly * by the xfs_droplink call for the ".." entry. */ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* Drop the link from dp to ip. */ error = xfs_droplink(tp, ip); if (error) return error; error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); if (error) { ASSERT(error != -ENOENT); return error; } /* Remove parent pointer. */ if (du->ppargs) { error = xfs_parent_removename(tp, du->ppargs, dp, name, ip); if (error) return error; } xfs_dir_update_hook(dp, ip, -1, name); return 0; } /* * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2, * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed. * @ip1 and @ip2 need not be of the same type. * * All inodes must have the ILOCK held, and both entries must already exist. */ int xfs_dir_exchange_children( struct xfs_trans *tp, struct xfs_dir_update *du1, struct xfs_dir_update *du2, unsigned int spaceres) { struct xfs_inode *dp1 = du1->dp; const struct xfs_name *name1 = du1->name; struct xfs_inode *ip1 = du1->ip; struct xfs_inode *dp2 = du2->dp; const struct xfs_name *name2 = du2->name; struct xfs_inode *ip2 = du2->ip; int ip1_flags = 0; int ip2_flags = 0; int dp2_flags = 0; int error; /* Swap inode number for dirent in first parent */ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); if (error) return error; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); if (error) return error; /* * If we're renaming one or more directories across different parents, * update the respective ".." entries (and link counts) to match the new * parents. */ if (dp1 != dp2) { dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; if (S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, dp1->i_ino, spaceres); if (error) return error; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_droplink(tp, dp2); if (error) return error; xfs_bumplink(tp, dp1); } /* * Although ip1 isn't changed here, userspace needs * to be warned about the change, so that applications * relying on it (like backup ones), will properly * notify the change */ ip1_flags |= XFS_ICHGTIME_CHG; ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; } if (S_ISDIR(VFS_I(ip1)->i_mode)) { error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, dp2->i_ino, spaceres); if (error) return error; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(VFS_I(ip2)->i_mode)) { error = xfs_droplink(tp, dp1); if (error) return error; xfs_bumplink(tp, dp2); } /* * Although ip2 isn't changed here, userspace needs * to be warned about the change, so that applications * relying on it (like backup ones), will properly * notify the change */ ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; ip2_flags |= XFS_ICHGTIME_CHG; } } if (ip1_flags) { xfs_trans_ichgtime(tp, ip1, ip1_flags); xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); } if (ip2_flags) { xfs_trans_ichgtime(tp, ip2, ip2_flags); xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); } if (dp2_flags) { xfs_trans_ichgtime(tp, dp2, dp2_flags); xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); /* Schedule parent pointer replacements */ if (du1->ppargs) { error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1, dp2, name2, ip1); if (error) return error; } if (du2->ppargs) { error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2, dp1, name1, ip2); if (error) return error; } /* * Inform our hook clients that we've finished an exchange operation as * follows: removed the source and target files from their directories; * added the target to the source directory; and added the source to * the target directory. All inodes are locked, so it's ok to model a * rename this way so long as we say we deleted entries before we add * new ones. */ xfs_dir_update_hook(dp1, ip1, -1, name1); xfs_dir_update_hook(dp2, ip2, -1, name2); xfs_dir_update_hook(dp1, ip2, 1, name1); xfs_dir_update_hook(dp2, ip1, 1, name2); return 0; } /* * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry * @target_name in directory @target_dp point to @src_ip and remove the * original entry, cleaning up everything left behind. * * Cleanup involves dropping a link count on @target_ip, and either removing * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry * with (@src_name, @wip) if a whiteout inode @wip is supplied. * * All inodes must have the ILOCK held. We assume that if @src_ip is a * directory then its '..' doesn't already point to @target_dp, and that @wip * is a freshly allocated whiteout. */ int xfs_dir_rename_children( struct xfs_trans *tp, struct xfs_dir_update *du_src, struct xfs_dir_update *du_tgt, unsigned int spaceres, struct xfs_dir_update *du_wip) { struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *src_dp = du_src->dp; const struct xfs_name *src_name = du_src->name; struct xfs_inode *src_ip = du_src->ip; struct xfs_inode *target_dp = du_tgt->dp; const struct xfs_name *target_name = du_tgt->name; struct xfs_inode *target_ip = du_tgt->ip; bool new_parent = (src_dp != target_dp); bool src_is_directory; int error; src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. */ if (target_ip == NULL) { /* * If there's no space reservation, check the entry will * fit before actually inserting it. */ if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) return error; } } else { /* * If target exists and it's a directory, check that whether * it can be destroyed. */ if (S_ISDIR(VFS_I(target_ip)->i_mode) && (!xfs_dir_isempty(target_ip) || (VFS_I(target_ip)->i_nlink > 2))) return -EEXIST; } /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct * AGI/AGF locking order. This dirties the transaction so failures * after this point will abort and log recovery will clean up the * mess. * * For whiteouts, we need to bump the link count on the whiteout * inode. After this point, we have a real link, clear the tmpfile * state flag from the inode so it doesn't accidentally get misused * in future. */ if (du_wip->ip) { struct xfs_perag *pag; ASSERT(VFS_I(du_wip->ip)->i_nlink == 0); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino)); error = xfs_iunlink_remove(tp, pag, du_wip->ip); xfs_perag_put(pag); if (error) return error; xfs_bumplink(tp, du_wip->ip); } /* * Set up the target. */ if (target_ip == NULL) { /* * If target does not exist and the rename crosses * directories, adjust the target directory link count * to account for the ".." reference from the new entry. */ error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { xfs_bumplink(tp, target_dp); } } else { /* target_ip != NULL */ /* * Link the source inode under the target name. * If the source inode is a directory and we are moving * it across directories, its ".." entry will be * inconsistent until we replace that down below. * * In case there is already an entry with the same * name at the destination directory, remove it first. */ error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* * Decrement the link count on the target since the target * dir no longer points to it. */ error = xfs_droplink(tp, target_ip); if (error) return error; if (src_is_directory) { /* * Drop the link from the old "." entry. */ error = xfs_droplink(tp, target_ip); if (error) return error; } } /* target_ip != NULL */ /* * Remove the source. */ if (new_parent && src_is_directory) { /* * Rewrite the ".." entry to point to the new * directory. */ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, target_dp->i_ino, spaceres); ASSERT(error != -EEXIST); if (error) return error; } /* * We always want to hit the ctime on the source inode. * * This isn't strictly required by the standards since the source * inode isn't really being changed, but old unix file systems did * it and some incremental backup programs won't work without it. */ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* * Adjust the link count on src_dp. This is necessary when * renaming a directory, either within one parent when * the target existed, or across two parent directories. */ if (src_is_directory && (new_parent || target_ip != NULL)) { /* * Decrement link count on src_directory since the * entry that's moved no longer points to it. */ error = xfs_droplink(tp, src_dp); if (error) return error; } /* * For whiteouts, we only need to update the source dirent with the * inode number of the whiteout inode rather than removing it * altogether. */ if (du_wip->ip) error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino, spaceres); else error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); if (error) return error; xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); /* Schedule parent pointer updates. */ if (du_wip->ppargs) { error = xfs_parent_addname(tp, du_wip->ppargs, src_dp, src_name, du_wip->ip); if (error) return error; } if (du_src->ppargs) { error = xfs_parent_replacename(tp, du_src->ppargs, src_dp, src_name, target_dp, target_name, src_ip); if (error) return error; } if (du_tgt->ppargs) { error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp, target_name, target_ip); if (error) return error; } /* * Inform our hook clients that we've finished a rename operation as * follows: removed the source and target files from their directories; * that we've added the source to the target directory; and finally * that we've added the whiteout, if there was one. All inodes are * locked, so it's ok to model a rename this way so long as we say we * deleted entries before we add new ones. */ if (target_ip) xfs_dir_update_hook(target_dp, target_ip, -1, target_name); xfs_dir_update_hook(src_dp, src_ip, -1, src_name); xfs_dir_update_hook(target_dp, src_ip, 1, target_name); if (du_wip->ip) xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name); return 0; } |
| 7 2 32 72 32 8 78 78 1 2 58 35 4 85 50 84 14 56 56 53 28 6 35 49 32 13 24 1 1 1 25 1 41 109 56 70 70 34 63 23 9 20 52 31 26 2 6 48 42 4 3 42 1 44 3 43 1 7 3 3 4 7 5 2 4 6 20 42 1 4 6 2 3 2 4 2 3 6 109 46 37 34 9 9 8 1 1 8 9 7 7 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/rock.c * * (C) 1992, 1993 Eric Youngdale * * Rock Ridge Extensions to iso9660 */ #include <linux/slab.h> #include <linux/pagemap.h> #include "isofs.h" #include "rock.h" /* * These functions are designed to read the system areas of a directory record * and extract relevant information. There are different functions provided * depending upon what information we need at the time. One function fills * out an inode structure, a second one extracts a filename, a third one * returns a symbolic link name, and a fourth one returns the extent number * for the file. */ #define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ struct rock_state { void *buffer; unsigned char *chr; int len; int cont_size; int cont_extent; int cont_offset; int cont_loops; struct inode *inode; }; /* * This is a way of ensuring that we have something in the system * use fields that is compatible with Rock Ridge. Return zero on success. */ static int check_sp(struct rock_ridge *rr, struct inode *inode) { if (rr->u.SP.magic[0] != 0xbe) return -1; if (rr->u.SP.magic[1] != 0xef) return -1; ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip; return 0; } static void setup_rock_ridge(struct iso_directory_record *de, struct inode *inode, struct rock_state *rs) { rs->len = sizeof(struct iso_directory_record) + de->name_len[0]; if (rs->len & 1) (rs->len)++; rs->chr = (unsigned char *)de + rs->len; rs->len = *((unsigned char *)de) - rs->len; if (rs->len < 0) rs->len = 0; if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) { rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset; rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset; if (rs->len < 0) rs->len = 0; } } static void init_rock_state(struct rock_state *rs, struct inode *inode) { memset(rs, 0, sizeof(*rs)); rs->inode = inode; } /* Maximum number of Rock Ridge continuation entries */ #define RR_MAX_CE_ENTRIES 32 /* * Returns 0 if the caller should continue scanning, 1 if the scan must end * and -ve on error. */ static int rock_continue(struct rock_state *rs) { int ret = 1; int blocksize = 1 << rs->inode->i_blkbits; const int min_de_size = offsetof(struct rock_ridge, u); kfree(rs->buffer); rs->buffer = NULL; if ((unsigned)rs->cont_offset > blocksize - min_de_size || (unsigned)rs->cont_size > blocksize || (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { printk(KERN_NOTICE "rock: corrupted directory entry. " "extent=%d, offset=%d, size=%d\n", rs->cont_extent, rs->cont_offset, rs->cont_size); ret = -EIO; goto out; } if (rs->cont_extent) { struct buffer_head *bh; rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL); if (!rs->buffer) { ret = -ENOMEM; goto out; } ret = -EIO; if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) goto out; bh = sb_bread(rs->inode->i_sb, rs->cont_extent); if (bh) { memcpy(rs->buffer, bh->b_data + rs->cont_offset, rs->cont_size); put_bh(bh); rs->chr = rs->buffer; rs->len = rs->cont_size; rs->cont_extent = 0; rs->cont_size = 0; rs->cont_offset = 0; return 0; } printk("Unable to read rock-ridge attributes\n"); } out: kfree(rs->buffer); rs->buffer = NULL; return ret; } /* * We think there's a record of type `sig' at rs->chr. Parse the signature * and make sure that there's really room for a record of that type. */ static int rock_check_overflow(struct rock_state *rs, int sig) { int len; switch (sig) { case SIG('S', 'P'): len = sizeof(struct SU_SP_s); break; case SIG('C', 'E'): len = sizeof(struct SU_CE_s); break; case SIG('E', 'R'): len = sizeof(struct SU_ER_s); break; case SIG('R', 'R'): len = sizeof(struct RR_RR_s); break; case SIG('P', 'X'): len = sizeof(struct RR_PX_s); break; case SIG('P', 'N'): len = sizeof(struct RR_PN_s); break; case SIG('S', 'L'): len = sizeof(struct RR_SL_s); break; case SIG('N', 'M'): len = sizeof(struct RR_NM_s); break; case SIG('C', 'L'): len = sizeof(struct RR_CL_s); break; case SIG('P', 'L'): len = sizeof(struct RR_PL_s); break; case SIG('T', 'F'): len = sizeof(struct RR_TF_s); break; case SIG('Z', 'F'): len = sizeof(struct RR_ZF_s); break; default: len = 0; break; } len += offsetof(struct rock_ridge, u); if (len > rs->len) { printk(KERN_NOTICE "rock: directory entry would overflow " "storage\n"); printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", sig, len, rs->len); return -EIO; } return 0; } /* * return length of name field; 0: not found, -1: to be ignored */ int get_rock_ridge_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { struct rock_state rs; struct rock_ridge *rr; int sig; int retnamlen = 0; int truncate = 0; int ret = 0; char *p; int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; *retname = 0; init_rock_state(&rs, inode); setup_rock_ridge(de, inode, &rs); repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; /* * Ignore rock ridge info if rr->len is out of range, but * don't return -EIO because that would make the file * invisible. */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); if (rock_check_overflow(&rs, sig)) goto eio; rs.chr += rr->len; rs.len -= rr->len; /* * As above, just ignore the rock ridge info if rr->len * is bogus. */ if (rs.len < 0) goto out; /* Something got screwed up here */ switch (sig) { case SIG('R', 'R'): if ((rr->u.RR.flags[0] & RR_NM) == 0) goto out; break; case SIG('S', 'P'): if (check_sp(rr, inode)) goto out; break; case SIG('C', 'E'): rs.cont_extent = isonum_733(rr->u.CE.extent); rs.cont_offset = isonum_733(rr->u.CE.offset); rs.cont_size = isonum_733(rr->u.CE.size); break; case SIG('N', 'M'): if (truncate) break; if (rr->len < 5) break; /* * If the flags are 2 or 4, this indicates '.' or '..'. * We don't want to do anything with this, because it * screws up the code that calls us. We don't really * care anyways, since we can just use the non-RR * name. */ if (rr->u.NM.flags & 6) break; if (rr->u.NM.flags & ~1) { printk("Unsupported NM flag settings (%d)\n", rr->u.NM.flags); break; } len = rr->len - 5; if (retnamlen + len >= 254) { truncate = 1; break; } p = memchr(rr->u.NM.name, '\0', len); if (unlikely(p)) len = p - rr->u.NM.name; memcpy(retname + retnamlen, rr->u.NM.name, len); retnamlen += len; retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); return -1; default: break; } } ret = rock_continue(&rs); if (ret == 0) goto repeat; if (ret == 1) return retnamlen; /* If 0, this file did not have a NM field */ out: kfree(rs.buffer); return ret; eio: ret = -EIO; goto out; } #define RR_REGARD_XA 1 #define RR_RELOC_DE 2 static int parse_rock_ridge_inode_internal(struct iso_directory_record *de, struct inode *inode, int flags) { int symlink_len = 0; int cnt, sig; unsigned int reloc_block; struct inode *reloc; struct rock_ridge *rr; int rootflag; struct rock_state rs; int ret = 0; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; init_rock_state(&rs, inode); setup_rock_ridge(de, inode, &rs); if (flags & RR_REGARD_XA) { rs.chr += 14; rs.len -= 14; if (rs.len < 0) rs.len = 0; } repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; /* * Ignore rock ridge info if rr->len is out of range, but * don't return -EIO because that would make the file * invisible. */ if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); if (rock_check_overflow(&rs, sig)) goto eio; rs.chr += rr->len; rs.len -= rr->len; /* * As above, just ignore the rock ridge info if rr->len * is bogus. */ if (rs.len < 0) goto out; /* Something got screwed up here */ switch (sig) { #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ case SIG('R', 'R'): if ((rr->u.RR.flags[0] & (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) goto out; break; #endif case SIG('S', 'P'): if (check_sp(rr, inode)) goto out; break; case SIG('C', 'E'): rs.cont_extent = isonum_733(rr->u.CE.extent); rs.cont_offset = isonum_733(rr->u.CE.offset); rs.cont_size = isonum_733(rr->u.CE.size); break; case SIG('E', 'R'): /* Invalid length of ER tag id? */ if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) goto out; ISOFS_SB(inode->i_sb)->s_rock = 1; printk(KERN_DEBUG "ISO 9660 Extensions: "); { int p; for (p = 0; p < rr->u.ER.len_id; p++) printk(KERN_CONT "%c", rr->u.ER.data[p]); } printk(KERN_CONT "\n"); break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); set_nlink(inode, isonum_733(rr->u.PX.n_links)); i_uid_write(inode, isonum_733(rr->u.PX.uid)); i_gid_write(inode, isonum_733(rr->u.PX.gid)); break; case SIG('P', 'N'): { int high, low; high = isonum_733(rr->u.PN.dev_high); low = isonum_733(rr->u.PN.dev_low); /* * The Rock Ridge standard specifies that if * sizeof(dev_t) <= 4, then the high field is * unused, and the device number is completely * stored in the low field. Some writers may * ignore this subtlety, * and as a result we test to see if the entire * device number is * stored in the low field, and use that. */ if ((low & ~0xff) && high == 0) { inode->i_rdev = MKDEV(low >> 8, low & 0xff); } else { inode->i_rdev = MKDEV(high, low); } } break; case SIG('T', 'F'): /* * Some RRIP writers incorrectly place ctime in the * TF_CREATE field. Try to handle this correctly for * either case. */ /* Rock ridge never appears on a High Sierra disk */ cnt = 0; if (rr->u.TF.flags & TF_CREATE) { inode_set_ctime(inode, iso_date(rr->u.TF.times[cnt++].time, 0), 0); } if (rr->u.TF.flags & TF_MODIFY) { inode_set_mtime(inode, iso_date(rr->u.TF.times[cnt++].time, 0), 0); } if (rr->u.TF.flags & TF_ACCESS) { inode_set_atime(inode, iso_date(rr->u.TF.times[cnt++].time, 0), 0); } if (rr->u.TF.flags & TF_ATTRIBUTES) { inode_set_ctime(inode, iso_date(rr->u.TF.times[cnt++].time, 0), 0); } break; case SIG('S', 'L'): { int slen; struct SL_component *slp; struct SL_component *oldslp; slen = rr->len - 5; slp = &rr->u.SL.link; inode->i_size = symlink_len; while (slen > 1) { rootflag = 0; switch (slp->flags & ~1) { case 0: inode->i_size += slp->len; break; case 2: inode->i_size += 1; break; case 4: inode->i_size += 2; break; case 8: rootflag = 1; inode->i_size += 1; break; default: printk("Symlink component flag " "not implemented\n"); } slen -= slp->len + 2; oldslp = slp; slp = (struct SL_component *) (((char *)slp) + slp->len + 2); if (slen < 2) { if (((rr->u.SL. flags & 1) != 0) && ((oldslp-> flags & 1) == 0)) inode->i_size += 1; break; } /* * If this component record isn't * continued, then append a '/'. */ if (!rootflag && (oldslp->flags & 1) == 0) inode->i_size += 1; } } symlink_len = inode->i_size; break; case SIG('R', 'E'): printk(KERN_WARNING "Attempt to read inode for " "relocated directory\n"); goto out; case SIG('C', 'L'): if (flags & RR_RELOC_DE) { printk(KERN_ERR "ISOFS: Recursive directory relocation " "is not supported\n"); goto eio; } reloc_block = isonum_733(rr->u.CL.location); if (reloc_block == ISOFS_I(inode)->i_iget5_block && ISOFS_I(inode)->i_iget5_offset == 0) { printk(KERN_ERR "ISOFS: Directory relocation points to " "itself\n"); goto eio; } ISOFS_I(inode)->i_first_extent = reloc_block; reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0); if (IS_ERR(reloc)) { ret = PTR_ERR(reloc); goto out; } inode->i_mode = reloc->i_mode; set_nlink(inode, reloc->i_nlink); inode->i_uid = reloc->i_uid; inode->i_gid = reloc->i_gid; inode->i_rdev = reloc->i_rdev; inode->i_size = reloc->i_size; inode->i_blocks = reloc->i_blocks; inode_set_atime_to_ts(inode, inode_get_atime(reloc)); inode_set_ctime_to_ts(inode, inode_get_ctime(reloc)); inode_set_mtime_to_ts(inode, inode_get_mtime(reloc)); iput(reloc); break; #ifdef CONFIG_ZISOFS case SIG('Z', 'F'): { int algo; if (ISOFS_SB(inode->i_sb)->s_nocompress) break; algo = isonum_721(rr->u.ZF.algorithm); if (algo == SIG('p', 'z')) { int block_shift = isonum_711(&rr->u.ZF.parms[1]); if (block_shift > 17) { printk(KERN_WARNING "isofs: " "Can't handle ZF block " "size of 2^%d\n", block_shift); } else { /* * Note: we don't change * i_blocks here */ ISOFS_I(inode)->i_file_format = isofs_file_compressed; /* * Parameters to compression * algorithm (header size, * block size) */ ISOFS_I(inode)->i_format_parm[0] = isonum_711(&rr->u.ZF.parms[0]); ISOFS_I(inode)->i_format_parm[1] = isonum_711(&rr->u.ZF.parms[1]); inode->i_size = isonum_733(rr->u.ZF. real_size); } } else { printk(KERN_WARNING "isofs: Unknown ZF compression " "algorithm: %c%c\n", rr->u.ZF.algorithm[0], rr->u.ZF.algorithm[1]); } break; } #endif default: break; } } ret = rock_continue(&rs); if (ret == 0) goto repeat; if (ret == 1) ret = 0; out: kfree(rs.buffer); return ret; eio: ret = -EIO; goto out; } static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) { int slen; int rootflag; struct SL_component *oldslp; struct SL_component *slp; slen = rr->len - 5; slp = &rr->u.SL.link; while (slen > 1) { rootflag = 0; switch (slp->flags & ~1) { case 0: if (slp->len > plimit - rpnt) return NULL; memcpy(rpnt, slp->text, slp->len); rpnt += slp->len; break; case 2: if (rpnt >= plimit) return NULL; *rpnt++ = '.'; break; case 4: if (2 > plimit - rpnt) return NULL; *rpnt++ = '.'; *rpnt++ = '.'; break; case 8: if (rpnt >= plimit) return NULL; rootflag = 1; *rpnt++ = '/'; break; default: printk("Symlink component flag not implemented (%d)\n", slp->flags); } slen -= slp->len + 2; oldslp = slp; slp = (struct SL_component *)((char *)slp + slp->len + 2); if (slen < 2) { /* * If there is another SL record, and this component * record isn't continued, then add a slash. */ if ((!rootflag) && (rr->u.SL.flags & 1) && !(oldslp->flags & 1)) { if (rpnt >= plimit) return NULL; *rpnt++ = '/'; } break; } /* * If this component record isn't continued, then append a '/'. */ if (!rootflag && !(oldslp->flags & 1)) { if (rpnt >= plimit) return NULL; *rpnt++ = '/'; } } return rpnt; } int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, int relocated) { int flags = relocated ? RR_RELOC_DE : 0; int result = parse_rock_ridge_inode_internal(de, inode, flags); /* * if rockridge flag was reset and we didn't look for attributes * behind eventual XA attributes, have a look there */ if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { result = parse_rock_ridge_inode_internal(de, inode, flags | RR_REGARD_XA); } return result; } /* * read_folio() for symlinks: reads symlink contents into the folio and either * makes it uptodate and returns 0 or returns error (-EIO) */ static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; struct iso_inode_info *ei = ISOFS_I(inode); struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); char *link = folio_address(folio); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; unsigned char *pnt; struct iso_directory_record *raw_de; unsigned long block, offset; int sig; struct rock_ridge *rr; struct rock_state rs; int ret; if (!sbi->s_rock) goto error; init_rock_state(&rs, inode); block = ei->i_iget5_block; bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; offset = ei->i_iget5_offset; pnt = (unsigned char *)bh->b_data + offset; raw_de = (struct iso_directory_record *)pnt; /* * If we go past the end of the buffer, there is some sort of error. */ if (offset + *pnt > bufsize) goto out_bad_span; /* * Now test for possible Rock Ridge extensions which will override * some of these numbers in the inode structure. */ setup_rock_ridge(raw_de, inode, &rs); repeat: while (rs.len > 2) { /* There may be one byte for padding somewhere */ rr = (struct rock_ridge *)rs.chr; if (rr->len < 3) goto out; /* Something got screwed up here */ sig = isonum_721(rs.chr); if (rock_check_overflow(&rs, sig)) goto out; rs.chr += rr->len; rs.len -= rr->len; if (rs.len < 0) goto out; /* corrupted isofs */ switch (sig) { case SIG('R', 'R'): if ((rr->u.RR.flags[0] & RR_SL) == 0) goto out; break; case SIG('S', 'P'): if (check_sp(rr, inode)) goto out; break; case SIG('S', 'L'): rpnt = get_symlink_chunk(rpnt, rr, link + (PAGE_SIZE - 1)); if (rpnt == NULL) goto out; break; case SIG('C', 'E'): /* This tells is if there is a continuation record */ rs.cont_extent = isonum_733(rr->u.CE.extent); rs.cont_offset = isonum_733(rr->u.CE.offset); rs.cont_size = isonum_733(rr->u.CE.size); break; default: break; } } ret = rock_continue(&rs); if (ret == 0) goto repeat; if (ret < 0) goto fail; if (rpnt == link) goto fail; brelse(bh); *rpnt = '\0'; ret = 0; end: folio_end_read(folio, ret == 0); return ret; /* error exit from macro */ out: kfree(rs.buffer); goto fail; out_noread: printk("unable to read i-node block"); goto fail; out_bad_span: printk("symlink spans iso9660 blocks\n"); fail: brelse(bh); error: ret = -EIO; goto end; } const struct address_space_operations isofs_symlink_aops = { .read_folio = rock_ridge_symlink_read_folio }; |
| 16 9 16 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | /* * linux/fs/nls/nls_iso8859-3.c * * Charset iso8859-3 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, /* 0xb0*/ 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0xa3, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ 0xb0, 0x00, 0xb2, 0xb3, 0xb4, 0xb5, 0x00, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0x00, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0x00, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0xc6, 0xe6, 0xc5, 0xe5, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xab, 0xbb, /* 0x18-0x1f */ 0xd5, 0xf5, 0x00, 0x00, 0xa6, 0xb6, 0xa1, 0xb1, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xa9, 0xb9, 0x00, 0x00, 0xac, 0xbc, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xaa, 0xba, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0xdd, 0xfd, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0xaf, 0xbf, 0x00, 0x00, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xa2, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xa3, 0xa4, 0x00, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x69, 0xba, 0xbb, 0xbc, 0xad, 0x00, 0xbf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0x00, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0x00, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xb3, 0xb4, 0x00, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0x49, 0xaa, 0xab, 0xac, 0xbd, 0x00, 0xaf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0x00, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-3", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_3(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_3(void) { unregister_nls(&table); } module_init(init_nls_iso8859_3) module_exit(exit_nls_iso8859_3) MODULE_DESCRIPTION("NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 145 68 58 59 64 197 22 22 22 22 22 22 22 22 79 79 79 79 58 22 79 79 79 77 35 352 35 217 50 79 58 129 329 329 329 217 146 258 124 88 242 243 244 146 146 135 10 146 145 145 98 98 94 93 94 63 33 21 196 195 196 194 2 79 141 4 46 98 165 147 60 81 59 98 98 96 76 76 76 76 217 216 3 217 217 216 197 216 97 96 93 17 17 17 4 4 6 6 5 6 37 37 34 34 17 17 17 31 17 31 17 17 17 17 17 17 6 17 144 145 50 127 128 145 145 143 144 93 6 88 88 88 145 59 121 145 144 145 145 47 47 47 47 47 43 43 133 53 133 128 6 82 98 186 4 9 179 179 179 170 15 185 6 23 165 56 143 177 6 3 1 123 57 5 3 16 43 48 5 4 42 42 184 184 136 38 33 33 13 17 4 13 13 141 1 5 6 136 136 124 124 1 47 99 18 5 15 18 2 18 9 9 3 6 9 4 4 39 15 1 1 90 90 86 39 61 11 2 9 3 86 86 83 4 76 14 1 18 2 41 18 11 10 2 59 37 3 6 4 24 33 36 35 6 36 15 39 1 4 43 75 3 3 49 67 11 11 11 69 11 3 4 83 39 61 90 4 1 2 1 2 1 1 1 1 1 1 18 16 2 6 11 3 8 4 5 6 3 4 5 6 5 2 2 4 5 6 27 27 27 27 27 23 1 23 23 23 11 4 4 11 11 27 27 27 12 27 27 27 27 5 27 16 11 17 17 17 12 5 31 31 4 27 72 72 39 35 3 3 3 3 3 40 40 39 77 77 11 12 7 48 4 4 41 44 72 72 66 8 71 1 71 71 72 66 8 17 64 10 17 17 17 18 53 53 69 63 8 69 18 53 95 89 95 94 95 3 2 56 2 62 16 16 80 56 24 56 2 1 1 25 45 16 95 95 95 16 82 4 1 93 1 98 80 19 98 98 93 19 89 95 95 95 21 90 95 95 95 94 95 1 94 95 95 1 1 95 95 98 97 97 38 67 86 96 13 15 73 1 98 99 16 1 97 1 98 98 95 1 1 95 99 99 9 2 7 78 30 44 6 45 1 25 3 11 38 10 6 6 70 72 13 64 9 4 4 3 9 9 9 6 5 5 9 2 8 7 2 87 87 86 65 46 86 10 78 6 81 82 21 26 20 1 1 1 7 82 82 66 2 34 82 2 74 9 23 67 60 8 82 36 34 31 27 3 5 32 32 216 216 216 215 40 200 70 187 181 5 42 42 12 4 11 1 10 1 3 5 10 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 | // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blk-crypto.h> #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "iostat.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; static struct bio_set f2fs_bioset; #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE int __init f2fs_init_bioset(void) { return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) { bioset_exit(&f2fs_bioset); } bool f2fs_is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; struct f2fs_sb_info *sbi; if (!mapping) return false; inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode)) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; } static enum count_type __read_io_type(struct page *page) { struct address_space *mapping = page_file_mapping(page); if (mapping) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi)) return F2FS_RD_META; if (inode->i_ino == F2FS_NODE_INO(sbi)) return F2FS_RD_NODE; } return F2FS_RD_DATA; } /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif }; struct bio_post_read_ctx { struct bio *bio; struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; /* * decompression_attempted keeps track of whether * f2fs_end_read_compressed_page() has been called on the pages in the * bio that belong to a compressed cluster yet. */ bool decompression_attempted; block_t fs_blkaddr; }; /* * Update and unlock a bio's pages, and free the bio. * * This marks pages up-to-date only if there was no error in the bio (I/O error, * decryption error, or verity error), as indicated by bio->bi_status. * * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) * aren't marked up-to-date here, as decompression is done on a per-compression- * cluster basis rather than a per-bio basis. Instead, we only must do two * things for each compressed page here: call f2fs_end_read_compressed_page() * with failed=true if an error occurred before it would have normally gotten * called (i.e., I/O error or decryption error, but *not* verity error), and * release the bio's reference to the decompress_io_ctx of the page's cluster. */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; struct bio_post_read_ctx *ctx = bio->bi_private; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } if (bio->bi_status) ClearPageUptodate(page); else SetPageUptodate(page); dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } if (ctx) mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; /* * Verify the bio's pages with fs-verity. Exclude compressed pages, * as those were handled separately by f2fs_end_read_compressed_page(). */ if (may_have_compressed_pages) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && !fsverity_verify_page(page)) { bio->bi_status = BLK_STS_IOERR; break; } } } else { fsverity_verify_bio(bio); } f2fs_finish_read_bio(bio, true); } /* * If the bio's data needs to be verified with fs-verity, then enqueue the * verity work for the bio. Otherwise finish the bio now. * * Note that to avoid deadlocks, the verity work can't be done on the * decryption/decompression workqueue. This is because verifying the data pages * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; if (ctx && (ctx->enabled_steps & STEP_VERITY)) { INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { f2fs_finish_read_bio(bio, in_task); } } /* * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last * remaining page was read by @ctx->bio. * * Note that a bio may span clusters (even a mix of compressed and uncompressed * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, false, blkaddr, in_task); else all_compressed = false; blkaddr++; } ctx->decompression_attempted = true; /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully * handled at the compression cluster level. */ if (all_compressed) ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { f2fs_finish_read_bio(bio, true); return; } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; bool intask = in_task(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; if (bio->bi_status) { f2fs_finish_read_bio(bio, intask); return; } if (ctx) { unsigned int enabled_steps = ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS); /* * If we have only decompression step between decompression and * decrypt, we don't need post processing for this. */ if (enabled_steps == STEP_DECOMPRESS && !f2fs_low_mem_mode(sbi)) { f2fs_handle_step_decompress(ctx, intask); } else if (enabled_steps) { INIT_WORK(&ctx->work, f2fs_post_read_work); queue_work(ctx->sbi->post_read_wq, &ctx->work); return; } } f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; struct bio_vec *bvec; struct bvec_iter_all iter_all; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page, false); fscrypt_finalize_bounce_page(&page); #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_is_compressed_page(page)) { f2fs_compress_write_end_io(bio, page); continue; } #endif if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && page_folio(page)->index != nid_of_node(page)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); } #ifdef CONFIG_BLK_DEV_ZONED static void f2fs_zone_write_end_io(struct bio *bio) { struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; bio->bi_private = io->bi_private; complete(&io->zone_wait); f2fs_write_end_io(bio); } #endif struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; if (f2fs_is_multi_device(sbi)) { for (i = 0; i < sbi->s_ndevs; i++) { if (FDEV(i).start_blk <= blk_addr && FDEV(i).end_blk >= blk_addr) { blk_addr -= FDEV(i).start_blk; bdev = FDEV(i).bdev; break; } } } if (sector) *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) { int i; if (!f2fs_is_multi_device(sbi)) return 0; for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) return i; return 0; } static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; if (fio->op != REQ_OP_WRITE) return 0; if (fio->type == DATA) io_flag = fio->sbi->data_io_flag; else if (fio->type == NODE) io_flag = fio->sbi->node_io_flag; else return 0; fua_flag = io_flag & temp_mask; meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: * REQ_META | REQ_FUA | * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; struct block_device *bdev; sector_t sector; struct bio *bio; bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags | f2fs_io_flags(fio), GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); return bio; } static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx, const struct f2fs_io_info *fio, gfp_t gfp_mask) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (!fio || !fio->encrypted_page) fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, pgoff_t next_idx, const struct f2fs_io_info *fio) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (fio && fio->encrypted_page) return !bio_has_crypt_ctx(bio); return fscrypt_mergeable_bio(bio, inode, next_idx); } void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(!is_read_io(bio_op(bio))); trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(is_read_io(bio_op(bio))); if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type)) blk_finish_plug(current->plug); trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; if (!io->bio) return; if (is_read_io(fio->op)) { trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_read_bio(io->sbi, io->bio, fio->type); } else { trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_write_bio(io->sbi, io->bio, fio->type); } io->bio = NULL; } static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; struct bvec_iter_all iter_all; if (!bio) return false; if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *target = bvec->bv_page; if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); if (IS_ERR(target)) continue; } if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_page(target); if (IS_ERR(target)) continue; } if (inode && inode == target->mapping->host) return true; if (page && page == target) return true; if (ino && ino == ino_of_node(target)) return true; } return false; } int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) { int i; for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = f2fs_kmalloc(sbi, array_size(n, sizeof(struct f2fs_bio_info)), GFP_KERNEL); if (!sbi->write_io[i]) return -ENOMEM; for (j = HOT; j < n; j++) { struct f2fs_bio_info *io = &sbi->write_io[i][j]; init_f2fs_rwsem(&io->io_rwsem); io->sbi = sbi; io->bio = NULL; io->last_block_in_bio = 0; spin_lock_init(&io->io_lock); INIT_LIST_HEAD(&io->io_list); INIT_LIST_HEAD(&io->bio_list); init_f2fs_rwsem(&io->bio_list_lock); #ifdef CONFIG_BLK_DEV_ZONED init_completion(&io->zone_wait); io->zone_pending_bio = NULL; io->bi_private = NULL; #endif } } return 0; } static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_write(&io->io_rwsem); if (!io->bio) goto unlock_out; /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); unlock_out: f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type, bool force) { enum temp_type temp; bool ret = true; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { if (!force) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); /* TODO: use HOT temp only for meta pages now. */ if (type >= META) break; } } void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type) { __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { f2fs_submit_merged_write(sbi, DATA); f2fs_submit_merged_write(sbi, NODE); f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, page_folio(fio->page)->index, fio, GFP_NOIO); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); else f2fs_submit_write_bio(fio->sbi, bio, fio->type); return 0; } static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { if (unlikely(sbi->max_io_bytes && bio->bi_iter.bi_size >= sbi->max_io_bytes)) return false; if (last_blkaddr + 1 != cur_blkaddr) return false; return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { if (io->fio.op != fio->op) return false; return io->fio.op_flags == fio->op_flags; } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io, struct f2fs_io_info *fio, block_t last_blkaddr, block_t cur_blkaddr) { if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) return false; return io_type_is_mergeable(io, fio); } static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct page *page, enum temp_type temp) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) { list_del(&be->list); kmem_cache_free(bio_entry_slab, be); } static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; int ret = -EAGAIN; for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; found = true; f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, *fio->last_block, fio->new_blkaddr)); if (f2fs_crypt_mergeable_bio(*bio, fio->page->mapping->host, page_folio(fio->page)->index, fio) && bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { ret = 0; break; } /* page can't be merged into bio; submit the bio */ del_bio_entry(be); f2fs_submit_write_bio(sbi, *bio, DATA); break; } f2fs_up_write(&io->bio_list_lock); } if (ret) { bio_put(*bio); *bio = NULL; } return ret; } void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct page *page) { enum temp_type temp; bool found = false; struct bio *target = bio ? *bio : NULL; f2fs_bug_on(sbi, !target && !page); for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; if (list_empty(head)) continue; f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) break; } f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) { target = be->bio; del_bio_entry(be); break; } } f2fs_up_write(&io->bio_list_lock); } if (found) f2fs_submit_write_bio(sbi, target, DATA); if (bio && *bio) { bio_put(*bio); *bio = NULL; } } int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, page_folio(fio->page)->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { if (add_ipu_page(fio, &bio, page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; return 0; } #ifdef CONFIG_BLK_DEV_ZONED static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) { struct block_device *bdev = sbi->sb->s_bdev; int devi = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkaddr); if (blkaddr < FDEV(devi).start_blk || blkaddr > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkaddr); return false; } blkaddr -= FDEV(devi).start_blk; bdev = FDEV(devi).bdev; } return bdev_is_zoned(bdev) && f2fs_blkz_is_seq(sbi, devi, blkaddr) && (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); } #endif void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { wait_for_completion_io(&io->zone_wait); bio_put(io->zone_pending_bio); io->zone_pending_bio = NULL; io->bi_private = NULL; } #endif if (fio->in_list) { spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); list_del(&fio->list); spin_unlock(&io->io_lock); } verify_fio_blkaddr(fio); if (fio->encrypted_page) bio_page = fio->encrypted_page; else if (fio->compressed_page) bio_page = fio->compressed_page; else bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = 1; type = WB_DATA_TYPE(bio_page, fio->compressed_page); inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, page_folio(bio_page)->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, page_folio(bio_page)->index, fio, GFP_NOIO); io->fio = *fio; } if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; trace_f2fs_submit_page_write(fio->page, fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { bio_get(io->bio); reinit_completion(&io->zone_wait); io->bi_private = io->bio->bi_private; io->bio->bi_private = io; io->bio->bi_end_io = f2fs_zone_write_end_io; io->zone_pending_bio = io->bio; __submit_merged_bio(io); } #endif if (fio->in_list) goto next; out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, blk_opf_t op_flag, pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; sector_t sector; struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= STEP_VERITY; /* * STEP_DECOMPRESS is handled specially, since a compressed file might * contain both compressed and uncompressed clusters. We'll allocate a * bio_post_read_ctx if the file is compressed, but the caller is * responsible for enabling STEP_DECOMPRESS if it's actually needed. */ if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { iostat_update_and_unbind_ctx(bio); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); return -EFAULT; } inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); dn->data_blkaddr = blkaddr; addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* * Lock ordering for the change of data block address: * ->data_page * ->node_page * update block addresses in the node page */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); __set_data_blkaddr(dn, blkaddr); if (set_page_dirty(dn->node_page)) dn->node_changed = true; } void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_set_data_blkaddr(dn, blkaddr); f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { __set_data_blkaddr(dn, NEW_ADDR); count--; } } if (set_page_dirty(dn->node_page)) dn->node_changed = true; return 0; } /* Should keep dn->ofs_in_node unchanged */ int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = dn->ofs_in_node; int ret; ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; int err; err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; int err; page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; goto put_err; } goto got_it; } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { if (err == -ENOENT && next_pgofs) *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; if (next_pgofs) *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto put_err; } got_it: if (PageUptodate(page)) { unlock_page(page); return page; } /* * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. * see, f2fs_add_link -> f2fs_get_new_data_page -> * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); unlock_page(page); return page; } err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; return page; put_err: f2fs_put_page(page, 1); return ERR_PTR(err); } struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; page = find_get_page(mapping, index); if (page && PageUptodate(page)) return page; f2fs_put_page(page, 0); page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; if (PageUptodate(page)) return page; wait_on_page_locked(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } return page; } /* * If it tries to access a hole, return an error. * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; /* wait for read completion */ lock_page(page); if (unlikely(page->mapping != mapping || !PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. */ struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; struct page *page; struct dnode_of_data dn; int err; page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released * if any error occur. */ f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); if (err) { f2fs_put_page(page, 1); return ERR_PTR(err); } if (!ipage) f2fs_put_dnode(&dn); if (PageUptodate(page)) goto got_it; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); } else { f2fs_put_page(page, 1); /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } got_it: if (new_i_size && i_size_read(inode) < ((loff_t)(index + 1) << PAGE_SHIFT)) f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; blkcnt_t count = 1; int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr == NULL_ADDR) { err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; } set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); if (err) return err; if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(sbi, old_blkaddr); f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; } static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read(&sbi->node_change); else f2fs_lock_op(sbi); } static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_up_read(&sbi->node_change); else f2fs_unlock_op(sbi); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, &dn->data_blkaddr)) err = f2fs_reserve_block(dn, index); f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int f2fs_map_no_dnode(struct inode *inode, struct f2fs_map_blocks *map, struct dnode_of_data *dn, pgoff_t pgoff) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* * There is one exceptional case that read_node_page() may return * -ENOENT due to filesystem has been shutdown or cp_error, return * -EIO in that case. */ if (map->m_may_create && (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) return -EIO; if (map->m_next_pgofs) *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); if (map->m_next_extent) *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); return 0; } static bool f2fs_map_blocks_cached(struct inode *inode, struct f2fs_map_blocks *map, int flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int maxblocks = map->m_len; pgoff_t pgoff = (pgoff_t)map->m_lblk; struct extent_info ei = {}; if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) return false; map->m_pblk = ei.blk + pgoff - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgoff + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (f2fs_allow_multi_device_dio(sbi, flag)) { int bidx = f2fs_target_device_index(sbi, map->m_pblk); struct f2fs_dev_info *dev = &sbi->devs[bidx]; map->m_bdev = dev->bdev; map->m_pblk -= dev->start_blk; map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); } else { map->m_bdev = inode->i_sb->s_bdev; } return true; } static bool map_is_mergeable(struct f2fs_sb_info *sbi, struct f2fs_map_blocks *map, block_t blkaddr, int flag, int bidx, int ofs) { if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) return false; if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) return true; if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) return true; if (flag == F2FS_GET_BLOCK_PRE_DIO) return true; if (flag == F2FS_GET_BLOCK_DIO && map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR) return true; return false; } /* * f2fs_map_blocks() tries to find or build mapping relationship which * maps continuous logical blocks to physical blocks, and return such * info via f2fs_map_blocks structure. */ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; block_t blkaddr; unsigned int start_pgofs; int bidx = 0; bool is_hole; if (!maxblocks) return 0; if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; map->m_bdev = inode->i_sb->s_bdev; map->m_multidev_dio = f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; next_dnode: if (map->m_may_create) f2fs_map_lock(sbi, flag); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; if (err == -ENOENT) err = f2fs_map_no_dnode(inode, map, &dn, pgofs); goto unlock_out; } start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); is_hole = !__is_valid_data_blkaddr(blkaddr); if (!is_hole && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto sync_out; } /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && !f2fs_is_pinned_file(inode)))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRE_AIO: if (blkaddr == NULL_ADDR) { prealloc++; last_ofs_in_node = dn.ofs_in_node; } break; case F2FS_GET_BLOCK_PRE_DIO: case F2FS_GET_BLOCK_DIO: err = __allocate_data_block(&dn, map->m_seg_type); if (err) goto sync_out; if (flag == F2FS_GET_BLOCK_PRE_DIO) file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); break; default: WARN_ON_ONCE(1); err = -EIO; goto sync_out; } blkaddr = dn.data_blkaddr; if (is_hole) map->m_flags |= F2FS_MAP_NEW; } else if (is_hole) { if (f2fs_compressed_file(inode) && f2fs_sanity_check_cluster(&dn)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_CLUSTER); goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRECACHE: goto sync_out; case F2FS_GET_BLOCK_BMAP: map->m_pblk = 0; goto sync_out; case F2FS_GET_BLOCK_FIEMAP: if (blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } break; case F2FS_GET_BLOCK_DIO: if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; break; default: /* for defragment case */ if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } } if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; if (map->m_multidev_dio) bidx = f2fs_target_device_index(sbi, blkaddr); if (map->m_len == 0) { /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; /* DIO READ and hole case, should not map the blocks. */ if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; map->m_len = 1; if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { goto sync_out; } skip: dn.ofs_in_node++; pgofs++; /* preallocate blocks in batch for one dnode page */ if (flag == F2FS_GET_BLOCK_PRE_AIO && (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { err = -ENOSPC; goto sync_out; } dn.ofs_in_node = end_offset; } if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && map->m_may_create) { /* the next block to be allocated may not be contiguous. */ if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) == CAP_BLKS_PER_SEC(sbi) - 1) goto sync_out; } if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) goto next_block; if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } } f2fs_put_dnode(&dn); if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; sync_out: if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { /* * for hardware encryption, but to avoid potential issue * in future */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; bidx = f2fs_target_device_index(sbi, map->m_pblk); map->m_bdev = FDEV(bidx).bdev; map->m_pblk -= FDEV(bidx).start_blk; if (map->m_may_create) f2fs_update_device_state(sbi, inode->i_ino, blk_addr, map->m_len); f2fs_bug_on(sbi, blk_addr + map->m_len > FDEV(bidx).end_blk + 1); } } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } if (map->m_next_extent) *map->m_next_extent = pgofs + 1; } f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, flag, err); return err; } bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) { struct f2fs_map_blocks map; block_t last_lblk; int err; if (pos + len > i_size_read(inode)) return false; map.m_lblk = F2FS_BYTES_TO_BLK(pos); map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; last_lblk = F2FS_BLK_ALIGN(pos + len); while (map.m_lblk < last_lblk) { map.m_len = last_lblk - map.m_lblk; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; } return true; } static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; struct node_info ni; __u64 phys = 0, len; __u32 flags; nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; if (f2fs_has_inline_xattr(inode)) { int offset; page = f2fs_grab_cache_page(NODE_MAPPING(sbi), inode->i_ino, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = F2FS_BLK_TO_BYTES(ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); phys += offset; len = inline_xattr_size(inode); f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; if (!xnid) flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); if (err) return err; } if (xnid) { page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_LAST; } if (phys) { err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); } return (err < 0 ? err : 0); } int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; sector_t start_blk, last_blk, blk_len, max_len; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); if (ret) return ret; } ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock_shared(inode); maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); if (start > maxbytes) { ret = -EFBIG; goto out; } if (len > maxbytes || (maxbytes - len) < start) len = maxbytes - start; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; } if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) goto out; } start_blk = F2FS_BYTES_TO_BLK(start); last_blk = F2FS_BYTES_TO_BLK(start + len - 1); blk_len = last_blk - start_blk + 1; max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; map.m_len = blk_len; map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; if (compr_cluster) { map.m_lblk += 1; map.m_len = cluster_size - count_in_cluster; } ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } /* * current extent may cross boundary of inquiry, increase len to * requery. */ if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && map.m_lblk + map.m_len - 1 == last_blk && blk_len != max_len) { blk_len = max_len; goto next; } compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || !(map.m_flags & F2FS_MAP_FLAGS))) { compr_appended = true; goto skip_fill; } if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); if (ret) goto out; size = 0; } if (start_blk > last_blk) goto out; skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; count_in_cluster = 1; } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; size += F2FS_BLK_TO_BYTES(appended_blks); start_blk += appended_blks; compr_cluster = false; } else { logical = F2FS_BLK_TO_BYTES(start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? F2FS_BLK_TO_BYTES(map.m_pblk) : 0; size = F2FS_BLK_TO_BYTES(map.m_len); flags = 0; if (compr_cluster) { flags = FIEMAP_EXTENT_ENCODED; count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; size += F2FS_BLKSIZE; } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } start_blk += F2FS_BYTES_TO_BLK(size); } prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; else goto next; out: if (ret == 1) ret = 0; inode_unlock_shared(inode); return ret; } static inline loff_t f2fs_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return F2FS_BLK_TO_BYTES(max_file_blocks(inode)); return i_size_read(inode); } static inline blk_opf_t f2fs_ra_op_flags(struct readahead_control *rac) { return rac ? REQ_RAHEAD : 0; } static int f2fs_read_single_page(struct inode *inode, struct folio *folio, unsigned nr_pages, struct f2fs_map_blocks *map, struct bio **bio_ret, sector_t *last_block_in_bio, struct readahead_control *rac) { struct bio *bio = *bio_ret; const unsigned int blocksize = F2FS_BLKSIZE; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t block_nr; pgoff_t index = folio_index(folio); int ret = 0; block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; /* just zeroing out page which is beyond EOF */ if (block_in_file >= last_block) goto zero_out; /* * Map blocks using the previous result first. */ if ((map->m_flags & F2FS_MAP_MAPPED) && block_in_file > map->m_lblk && block_in_file < (map->m_lblk + map->m_len)) goto got_it; /* * Then do more f2fs_map_blocks() calls until we are * done with this page. */ map->m_lblk = block_in_file; map->m_len = last_block - block_in_file; ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); if (ret) goto out; got_it: if ((map->m_flags & F2FS_MAP_MAPPED)) { block_nr = map->m_pblk + block_in_file - map->m_lblk; folio_set_mappedtodisk(folio); if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; goto out; } } else { zero_out: folio_zero_segment(folio, 0, folio_size(folio)); if (f2fs_need_verity(inode, index) && !fsverity_verify_folio(folio)) { ret = -EIO; goto out; } if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); folio_unlock(folio); goto out; } /* * This page will go to BIO. Do we need to send this * BIO off first? */ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; goto out; } } /* * If the page is under writeback, we need to wait for * its completion to see the correct decrypted data. */ f2fs_wait_on_block_writeback(inode, block_nr); if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = block_nr; out: *bio_ret = bio; return ret; } #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, struct readahead_control *rac, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; const unsigned int blocksize = F2FS_BLKSIZE; struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; struct folio *folio; if (!page) continue; folio = page_folio(page); if ((sector_t)folio->index >= last_block_in_file) { folio_zero_segment(folio, 0, folio_size(folio)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); } else if (!folio_test_uptodate(folio)) { continue; } folio_unlock(folio); if (for_write) folio_put(folio); cc->rpages[i] = NULL; cc->nr_rpages--; } /* we are done since all pages are beyond EOF */ if (f2fs_cluster_is_empty(cc)) goto out; if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) goto skip_reading_dnode; set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) goto out; if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto out_put_dnode; } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) : ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { ret = -EFAULT; goto out_put_dnode; } cc->nr_cpages++; if (!from_dnode && i >= ei.c_len) break; } /* nothing to decompress */ if (cc->nr_cpages == 0) { ret = 0; goto out_put_dnode; } dic = f2fs_alloc_dic(cc); if (IS_ERR(dic)) { ret = PTR_ERR(dic); goto out_put_dnode; } for (i = 0; i < cc->nr_cpages; i++) { struct folio *folio = page_folio(dic->cpages[i]); block_t blkaddr; struct bio_post_read_ctx *ctx; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1) : ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); if (f2fs_load_compressed_page(sbi, folio_page(folio, 0), blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); break; } continue; } if (bio && (!page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, folio->index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(sbi, bio, DATA); bio = NULL; } if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, f2fs_ra_op_flags(rac), folio->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; } } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = blkaddr; } if (from_dnode) f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: if (from_dnode) f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); unlock_page(cc->rpages[i]); } } *bio_ret = bio; return ret; } #endif /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .cpages = NULL, .nr_rpages = 0, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; pgoff_t index; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; for (; nr_pages; nr_pages--) { if (rac) { folio = readahead_folio(rac); prefetchw(&folio->flags); } #ifdef CONFIG_F2FS_FS_COMPRESSION index = folio_index(folio); if (!f2fs_compressed_file(inode)) goto read_single_page; /* there are remained compressed pages, submit them */ if (!f2fs_cluster_can_merge_page(&cc, index)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac, false); f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } if (cc.cluster_idx == NULL_CLUSTER) { if (nc_cluster_idx == index >> cc.log_cluster_size) goto read_single_page; ret = f2fs_is_compressed_cluster(inode, index); if (ret < 0) goto set_error_page; else if (!ret) { nc_cluster_idx = index >> cc.log_cluster_size; goto read_single_page; } nc_cluster_idx = NULL_CLUSTER; } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; f2fs_compress_ctx_add_page(&cc, folio); goto next_page; read_single_page: #endif ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map, &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION next_page: #endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { /* last page */ if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac, false); f2fs_destroy_compress_ctx(&cc, false); } } #endif } if (bio) f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } static int f2fs_read_data_folio(struct file *file, struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; int ret = -EAGAIN; trace_f2fs_readpage(folio, DATA); if (!f2fs_is_compress_backend_ready(inode)) { folio_unlock(folio); return -EOPNOTSUPP; } /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, folio); if (ret == -EAGAIN) ret = f2fs_mpage_readpages(inode, NULL, folio); return ret; } static void f2fs_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) return; /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; page = fio->compressed_page ? fio->compressed_page : fio->page; if (fscrypt_inode_uses_inline_crypto(inode)) return 0; retry_encrypt: fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } return PTR_ERR(fio->encrypted_page); } mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); if (mpage) { if (PageUptodate(mpage)) memcpy(page_address(mpage), page_address(fio->encrypted_page), PAGE_SIZE); f2fs_put_page(mpage, 1); } return 0; } static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && is_inode_flag_set(inode, FI_OPU_WRITE)) return false; if (IS_F2FS_IPU_FORCE(sbi)) return true; if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) return true; if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; /* * IPU for rewrite async pages */ if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) return true; if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; return false; } bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return false; if (f2fs_is_pinned_file(inode)) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); } bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* The below cases were checked when setting it. */ if (f2fs_is_pinned_file(inode)) return false; if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; if (IS_NOQUOTA(inode)) return true; if (f2fs_used_in_atomic_write(inode)) return true; /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ if (f2fs_compressed_file(inode) && F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; if (is_inode_flag_set(inode, FI_OPU_WRITE)) return true; if (fio) { if (page_private_gcing(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; } return false; } static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; if (f2fs_should_update_outplace(inode, fio)) return false; return f2fs_should_update_inplace(inode, fio); } int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct folio *folio = page_folio(fio->page); struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; bool ipu_force = false; bool atomic_commit; int err = 0; /* Use COW inode to make dnode_of_data for atomic write */ atomic_commit = f2fs_is_atomic_file(inode) && page_private_atomic(folio_page(folio, 0)); if (atomic_commit) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && f2fs_lookup_read_extent_cache_block(inode, folio->index, &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) return -EFSCORRUPTED; ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); if (err) goto out; fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { folio_clear_uptodate(folio); clear_page_private_gcing(folio_page(folio, 0)); goto out_writepage; } got_it: if (__is_valid_data_blkaddr(fio->old_blkaddr) && !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto out_writepage; } /* wait for GCed page writeback via META_MAPPING */ if (fio->meta_gc) f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; folio_start_writeback(folio); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); } else { set_inode_flag(inode, FI_UPDATE_WRITE); } trace_f2fs_do_write_data_page(folio, IPU); return err; } if (fio->need_lock == LOCK_RETRY) { if (!f2fs_trylock_op(fio->sbi)) { err = -EAGAIN; goto out_writepage; } fio->need_lock = LOCK_REQ; } err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; fio->version = ni.version; err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; folio_start_writeback(folio); if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(folio, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (atomic_commit) clear_page_private_atomic(folio_page(folio, 0)); out_writepage: f2fs_put_dnode(&dn); out: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } int f2fs_write_single_data_page(struct folio *folio, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance) { struct inode *inode = folio->mapping->host; struct page *page = folio_page(folio, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(folio->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; bool quota_inode = IS_NOQUOTA(inode); int err = 0; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY, .meta_gc = f2fs_meta_inode_gc_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, .bio = bio, .last_block = last_block, }; trace_f2fs_writepage(folio, DATA); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(folio->mapping, -EIO); /* * don't drop any dirty dentry pages for keeping lastest * directory structure. */ if (S_ISDIR(inode->i_mode) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; /* keep data pages in remount-ro mode */ if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) goto redirty_out; goto out; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (folio->index < end_index || f2fs_verity_in_progress(inode) || compr_blocks) goto write; /* * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ offset = i_size & (PAGE_SIZE - 1); if ((folio->index >= end_index + 1) || !offset) goto out; folio_zero_segment(folio, offset, folio_size(folio)); write: /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ if (quota_inode) f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (quota_inode) f2fs_up_read(&sbi->node_write); goto done; } if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; else set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, folio); if (!err) goto out; } if (err == -EAGAIN) { err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { f2fs_bug_on(sbi, compr_blocks); fio.need_lock = LOCK_REQ; err = f2fs_do_write_data_page(&fio); } } if (err) { file_set_keep_isize(inode); } else { spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; spin_unlock(&F2FS_I(inode)->i_size_lock); } done: if (err && err != -ENOENT) goto redirty_out; out: inode_dec_dirty_pages(inode); if (err) { folio_clear_uptodate(folio); clear_page_private_gcing(page); } if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_write(sbi, DATA); if (bio && *bio) f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } if (submitted) *submitted = fio.submitted; return 0; redirty_out: folio_redirty_for_writepage(wbc, folio); /* * pageout() in MM translates EAGAIN, so calls handle_write_error() * -> mapping_set_error() -> set_bit(AS_EIO, ...). * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. */ if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; folio_unlock(folio); return err; } static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); #ifdef CONFIG_F2FS_FS_COMPRESSION struct inode *inode = folio->mapping->host; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) goto out; if (f2fs_compressed_file(inode)) { if (f2fs_is_compressed_cluster(inode, folio->index)) { folio_redirty_for_writepage(wbc, folio); return AOP_WRITEPAGE_ACTIVATE; } } out: #endif return f2fs_write_single_data_page(folio, NULL, NULL, NULL, wbc, FS_DATA_IO, 0, true); } /* * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { int ret = 0; int done = 0, retry = 0; struct page *pages_local[F2FS_ONSTACK_PAGES]; struct page **pages = pages_local; struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; #ifdef CONFIG_F2FS_FS_COMPRESSION struct inode *inode = mapping->host; struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, .private = NULL, }; #endif int nr_folios, p, idx; int nr_pages; unsigned int max_pages = F2FS_ONSTACK_PAGES; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; int nwritten = 0; int submitted = 0; int i; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode) && 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { pages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); max_pages = 1 << cc.log_cluster_size; } #endif folio_batch_init(&fbatch); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); else clear_inode_flag(mapping->host, FI_HOT_DATA); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { nr_pages = 0; again: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) { if (nr_pages) goto write; break; } for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; idx = 0; p = folio_nr_pages(folio); add_more: pages[nr_pages] = folio_page(folio, idx); folio_get(folio); if (++nr_pages == max_pages) { index = folio->index + idx + 1; folio_batch_release(&fbatch); goto write; } if (++idx < p) goto add_more; } folio_batch_release(&fbatch); goto again; write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { void *fsdata = NULL; struct page *pagep; int ret2; ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; break; } if (!f2fs_cluster_can_merge_page(&cc, folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) need_readd = true; goto result; } if (unlikely(f2fs_cp_error(sbi))) goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, false))) { retry = 1; break; } } #endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } #ifdef CONFIG_F2FS_FS_COMPRESSION lock_folio: #endif done_index = folio->index; retry_write: folio_lock(folio); if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(&folio->page, DATA, true, true); } if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { folio_get(folio); f2fs_compress_ctx_add_page(&cc, folio); continue; } #endif ret = f2fs_write_single_data_page(folio, &submitted, &bio, &last_block, wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif nwritten += submitted; wbc->nr_to_write -= submitted; if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { ret = 0; goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; } done_index = folio_next_index(folio); done = 1; break; } if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } next: if (need_readd) goto readd; } release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* flush remained pages in compress cluster */ if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); nwritten += submitted; wbc->nr_to_write -= submitted; if (ret) { done = 1; retry = 0; } } if (f2fs_compressed_file(inode)) f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; end = -1; goto retry; } if (wbc->range_cyclic && !done) done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, NULL, 0, DATA); /* submit cached bio of IPU write */ if (bio) f2fs_submit_merged_ipu_write(sbi, &bio, NULL); #ifdef CONFIG_F2FS_FS_COMPRESSION if (pages != pages_local) kfree(pages); #endif return ret; } static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) return false; if (IS_NOQUOTA(inode)) return false; if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) return true; return false; } static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; bool locked = false; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing in file defragment preparing stage */ if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); else if (atomic_read(&sbi->wb_sync_req[DATA])) { /* to avoid potential deadlock */ if (current->plug) blk_finish_plug(current->plug); goto skip_write; } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); locked = true; } blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (locked) mutex_unlock(&sbi->writepages); if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ f2fs_remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; return __f2fs_write_data_pages(mapping, wbc, F2FS_I(inode)->cp_task == current ? FS_CP_DATA_IO : FS_DATA_IO); } void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) return; /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } static int prepare_write_begin(struct f2fs_sb_info *sbi, struct folio *folio, loff_t pos, unsigned int len, block_t *blk_addr, bool *node_changed) { struct inode *inode = folio->mapping->host; pgoff_t index = folio->index; struct dnode_of_data dn; struct page *ipage; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; /* * If a whole page is being written and we already preallocated all the * blocks, then there is no need to get a block address now. */ if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ if (f2fs_has_inline_data(inode)) { if (pos + len > MAX_INLINE_DATA(inode)) flag = F2FS_GET_BLOCK_DEFAULT; f2fs_map_lock(sbi, flag); locked = true; } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { f2fs_map_lock(sbi, flag); locked = true; } restart: /* check inline_data */ ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { f2fs_do_read_inline_data(folio, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); goto out; } err = f2fs_convert_inline_page(&dn, folio_page(folio, 0)); if (err || dn.data_blkaddr != NULL_ADDR) goto out; } if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (IS_DEVICE_ALIASING(inode)) { err = -ENODATA; goto out; } if (locked) { err = f2fs_reserve_block(&dn, index); goto out; } /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (!err && dn.data_blkaddr != NULL_ADDR) goto out; f2fs_put_dnode(&dn); f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } out: if (!err) { /* convert_inline_page can make node_changed */ *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; } f2fs_put_dnode(&dn); unlock_out: if (locked) f2fs_map_unlock(sbi, flag); return err; } static int __find_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr) { struct dnode_of_data dn; struct page *ipage; int err = 0; ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { dn.data_blkaddr = NULL_ADDR; err = 0; } } *blk_addr = dn.data_blkaddr; f2fs_put_dnode(&dn); return err; } static int __reserve_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr, bool *node_changed) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct page *ipage; int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, &dn.data_blkaddr)) err = f2fs_reserve_block(&dn, index); *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; f2fs_put_dnode(&dn); unlock_out: f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct folio *folio, loff_t pos, unsigned int len, block_t *blk_addr, bool *node_changed, bool *use_cow) { struct inode *inode = folio->mapping->host; struct inode *cow_inode = F2FS_I(inode)->cow_inode; pgoff_t index = folio->index; int err = 0; block_t ori_blk_addr = NULL_ADDR; /* If pos is beyond the end of file, reserve a new block in COW inode */ if ((pos & PAGE_MASK) >= i_size_read(inode)) goto reserve_block; /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); if (err) { return err; } else if (*blk_addr != NULL_ADDR) { *use_cow = true; return 0; } if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) goto reserve_block; /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) return err; reserve_block: /* Finally, we should reserve a new block in COW inode for the update */ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); if (err) return err; inc_atomic_write_cnt(inode); if (ori_blk_addr != NULL_ADDR) *blk_addr = ori_blk_addr; return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len); if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; goto fail; } /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: * folio_lock(folio #0) -> folio_lock(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); if (err) goto fail; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; struct page *page; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, &page, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { *foliop = page_folio(page); return 0; } } #endif repeat: /* * Do not use FGP_STABLE to avoid deadlock. * Will wait that below with our IO control. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ *foliop = folio; if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, folio, pos, len, &blkaddr, &need_balance, &use_cow); else err = prepare_write_begin(sbi, folio, pos, len, &blkaddr, &need_balance); if (err) goto put_folio; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { folio_unlock(folio); f2fs_balance_fs(sbi, true); folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); goto repeat; } } f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { folio_zero_segment(folio, len, folio_size(folio)); return 0; } if (blkaddr == NEW_ADDR) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; goto put_folio; } err = f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); if (err) goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto put_folio; } } return 0; put_folio: folio_unlock(folio); folio_put(folio); fail: f2fs_write_failed(inode, pos + len); return err; } static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); /* * This should be come from len == PAGE_SIZE, and we expect copied * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ if (!folio_test_uptodate(folio)) { if (unlikely(copied != len)) copied = 0; else folio_mark_uptodate(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { f2fs_compress_write_end(inode, fsdata, folio->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) f2fs_i_size_write(inode, pos + copied); return copied; } #endif if (!copied) goto unlock_out; folio_mark_dirty(folio); if (f2fs_is_atomic_file(inode)) set_page_private_atomic(folio_page(folio, 0)); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); if (f2fs_is_atomic_file(inode)) f2fs_i_size_write(F2FS_I(inode)->cow_inode, pos + copied); } unlock_out: folio_unlock(folio); folio_put(folio); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct inode *inode = folio->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && (offset || length != folio_size(folio))) return; if (folio_test_dirty(folio)) { if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } } clear_page_private_all(&folio->page); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) { /* If this is dirty folio, keep private data */ if (folio_test_dirty(folio)) return false; clear_page_private_all(&folio->page); return true; } static bool f2fs_dirty_data_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; trace_f2fs_set_page_dirty(folio, DATA); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } return false; } static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct dnode_of_data dn; sector_t start_idx, blknr = 0; int ret; start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) return 0; if (dn.data_blkaddr != COMPRESS_ADDR) { dn.ofs_in_node += block - start_idx; blknr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blknr)) blknr = 0; } f2fs_put_dnode(&dn); return blknr; #else return 0; #endif } static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t blknr = 0; if (f2fs_has_inline_data(inode)) goto out; /* make sure allocating whole blocks */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { blknr = f2fs_bmap_compress(inode, block); } else { struct f2fs_map_blocks map; memset(&map, 0, sizeof(map)); map.m_lblk = block; map.m_len = 1; map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) blknr = map.m_pblk; } out: trace_f2fs_bmap(inode, block, blknr); return blknr; } #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blkofs; unsigned int blk_per_sec = BLKS_PER_SEC(sbi); unsigned int end_blk = start_blk + blkcnt - 1; unsigned int secidx = start_blk / blk_per_sec; unsigned int end_sec; int ret = 0; if (!blkcnt) return 0; end_sec = end_blk / blk_per_sec; f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); set_inode_flag(inode, FI_OPU_WRITE); for (; secidx <= end_sec; secidx++) { unsigned int blkofs_end = secidx == end_sec ? end_blk % blk_per_sec : blk_per_sec - 1; f2fs_down_write(&sbi->pin_sem); ret = f2fs_allocate_pinning_section(sbi); if (ret) { f2fs_up_write(&sbi->pin_sem); break; } set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs <= blkofs_end; blkofs++) { struct page *page; unsigned int blkidx = secidx * blk_per_sec + blkofs; page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } set_page_dirty(page); f2fs_put_page(page, 1); } clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); f2fs_up_write(&sbi->pin_sem); if (ret) break; } done: clear_inode_flag(inode, FI_SKIP_WRITES); clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t cur_lblock; block_t last_lblock; block_t pblock; block_t lowest_pblock = -1; block_t highest_pblock = 0; int nr_extents = 0; unsigned int nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); unsigned int not_aligned = 0; int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try * to be very smart. */ cur_lblock = 0; last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; retry: cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; map.m_len = last_lblock - cur_lblock; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } pblock = map.m_pblk; nr_pblocks = map.m_len; if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || nr_pblocks % blks_per_sec || !f2fs_valid_pinned_area(sbi, pblock)) { bool last_extent = false; not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); if (cur_lblock + nr_pblocks > sis->max) nr_pblocks -= blks_per_sec; /* this extent is last one */ if (!nr_pblocks) { nr_pblocks = last_lblock - cur_lblock; last_extent = true; } ret = f2fs_migrate_blocks(inode, cur_lblock, nr_pblocks); if (ret) { if (ret == -ENOENT) ret = -EINVAL; goto out; } if (!last_extent) goto retry; } if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; if (cur_lblock) { /* exclude the header page */ if (pblock < lowest_pblock) lowest_pblock = pblock; if (pblock + nr_pblocks - 1 > highest_pblock) highest_pblock = pblock + nr_pblocks - 1; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); if (ret < 0) goto out; nr_extents += ret; cur_lblock += nr_pblocks; } ret = nr_extents; *span = 1 + highest_pblock - lowest_pblock; if (cur_lblock == 0) cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) return -EROFS; if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) { f2fs_err(sbi, "Swapfile not supported in LFS mode"); return -EINVAL; } ret = f2fs_convert_inline_inode(inode); if (ret) return ret; if (!f2fs_disable_compressed_file(inode)) return -EINVAL; ret = filemap_fdatawrite(inode->i_mapping); if (ret < 0) return ret; f2fs_precache_extents(inode); ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(sbi, REQ_TIME); return ret; } static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } static void f2fs_swap_deactivate(struct file *file) { } #endif const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, }; void f2fs_clear_page_cache_dirty_tag(struct folio *folio) { struct address_space *mapping = folio->mapping; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } int __init f2fs_init_post_read_processing(void) { bio_post_read_ctx_cache = kmem_cache_create("f2fs_bio_post_read_ctx", sizeof(struct bio_post_read_ctx), 0, 0, NULL); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void f2fs_destroy_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) { if (!f2fs_sb_has_encrypt(sbi) && !f2fs_sb_has_verity(sbi) && !f2fs_sb_has_compression(sbi)) return 0; sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) { if (sbi->post_read_wq) destroy_workqueue(sbi->post_read_wq); } int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct f2fs_map_blocks map = {}; pgoff_t next_pgofs = 0; int err; map.m_lblk = F2FS_BYTES_TO_BLK(offset); map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); if (flags & IOMAP_WRITE) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); if (err) return err; iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * We should never see delalloc or compressed extents here based on * prior flushing and checks. */ if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) return -EINVAL; if (map.m_flags & F2FS_MAP_MAPPED) { if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; if (map.m_pblk == NULL_ADDR) { iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - iomap->offset; iomap->type = IOMAP_HOLE; } else if (map.m_pblk == NEW_ADDR) { iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_UNWRITTEN; } else { f2fs_bug_on(F2FS_I_SB(inode), 1); } iomap->addr = IOMAP_NULL_ADDR; } if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if ((inode->i_state & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; return 0; } const struct iomap_ops f2fs_iomap_ops = { .iomap_begin = f2fs_iomap_begin, }; |
| 64 64 63 2 2 5 57 231 230 1 3 11 2 204 6 206 219 5 2 3 2 3 1 1 32 402 401 5 27 206 25 206 206 2 2 2 11 2 2 11 11 17 24 4 16 4 16 14 2 4 10 4 6 6 14 14 13 13 2 1 1 2 29 29 6 23 22 1 23 23 23 2 21 7 20 1 21 23 42 3 35 28 10 3 1 1 1 1 1 1 478 12 12 2 6 10 10 10 10 9 544 545 506 90 503 59 496 498 6 339 15 372 12 5 543 514 57 59 503 19 493 20 20 505 25 485 490 6 24 480 469 385 3 476 477 476 15 208 208 483 10 78 1 135 4 522 5 16 4 4 3 3 4 531 2 529 31 253 301 2 5 294 492 12 12 1 1 10 9 10 10 10 10 10 8 9 10 12 12 12 10 10 1 9 10 10 8 2 7 10 2 2 166 166 160 2 162 147 162 162 31 145 162 162 153 154 20 148 148 148 1 1 14 14 2 2 1 11 9 2 11 11 1 10 11 1 10 11 10 11 184 184 184 1 160 6 22 162 4 1 163 16 3 8 10 10 26 162 5 1 3 2 1 1 4 4 4 73 65 7 59 14 73 57 57 8 50 1 7 9 74 3 39 115 71 27 18 110 3 107 21 13 75 15 92 94 106 15 1 14 3 2 9 2 12 12 14 20 1 19 4 16 2 17 4 16 8 47 8 47 61 61 35 26 1 41 33 8 41 41 68 68 1 66 7 1 6 5 55 55 48 7 5 56 65 62 44 5 3 1 3 1 5 1 2 1 4 25 21 5 5 1 2 1 3 19 23 26 1 1 1 23 1 1 12 9 7 2 1 7 1 8 8 9 35 35 1 34 34 26 8 2 32 32 32 1 31 1 31 36 1 36 1 35 36 1 5 5 31 1 3 1 26 2 4 5 10 1 6 6 18 1 25 20 5 13 13 1 7 5 12 11 1 12 15 14 1 14 1 13 7 1 1 1 1 5 4 23 12 11 1 8 17 10 4 3 6 1 51 5 46 18 37 6 6 6 6 6 6 31 2 1 2 27 27 4 8 6 2 2 26 7 19 2 1 3 23 73 73 2 1 1 39 28 1 38 28 43 23 66 41 23 3 63 16 38 11 14 18 13 41 33 21 33 20 37 13 3 18 28 20 34 1 13 4 9 1 18 33 27 1 3 23 41 66 67 67 11 2 8 11 1 10 10 6 6 4 8 2 8 2 1 1 1 6 2 6 2 11 11 11 11 1 10 86 1 85 7 11 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) * Daniel Phillips, 2001 * Hash Tree Directory indexing porting * Christopher Li, 2002 * Hash Tree Directory indexing cleanup * Theodore Ts'o, 2002 */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { struct ext4_map_blocks map; struct buffer_head *bh; int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; map.m_lblk = *block; map.m_len = 1; /* * We're appending new directory block. Make sure the block is not * allocated yet, otherwise we will end up corrupting the * directory. */ err = ext4_map_blocks(NULL, inode, &map, 0); if (err < 0) return ERR_PTR(err); if (err) { EXT4_ERROR_INODE(inode, "Logical block already allocated"); return ERR_PTR(-EFSCORRUPTED); } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; return bh; out: brelse(bh); ext4_std_error(inode->i_sb, err); return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); /* * Hints to ext4_read_dirblock regarding whether we expect a directory * block being read to be an index block, or a block containing * directory entries (and if the latter, whether it was found via a * logical block in an htree index block). This is used to control * what sort of sanity checkinig ext4_read_dirblock() will do on the * directory block read from the storage device. EITHER will means * the caller doesn't know what kind of directory block will be read, * so no specific verification will be done. */ typedef enum { EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_lblk_t block, dirblock_type_t type, const char *func, unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; int is_dx_block = 0; if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); return ERR_PTR(-EFSCORRUPTED); } if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); return bh; } /* The first directory block must not be a hole. */ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, "Directory hole found for htree %s block %u", (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { if (block == 0) is_dx_block = 1; else if (ext4_rec_len_from_disk(dirent->rec_len, inode->i_sb->s_blocksize) == inode->i_sb->s_blocksize) is_dx_block = 1; } if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; /* * An empty leaf block can get mistaken for a index block; for * this reason, we can only check the index checksum when the * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { if (ext4_dx_csum_verify(inode, dirent) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { if (ext4_dirblock_csum_verify(inode, bh) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } return bh; } #ifdef DX_DEBUG #define dxtrace(command) command #else #define dxtrace(command) #endif struct fake_dirent { __le32 inode; __le16 rec_len; u8 name_len; u8 file_type; }; struct dx_countlimit { __le16 limit; __le16 count; }; struct dx_entry { __le32 hash; __le32 block; }; /* * dx_root_info is laid out so that if it should somehow get overlaid by a * dirent the two low bits of the hash version will be zero. Therefore, the * hash version mod 4 should never be 0. Sincerely, the paranoia department. */ struct dx_root { struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; char dotdot_name[4]; struct dx_root_info { __le32 reserved_zero; u8 hash_version; u8 info_length; /* 8 */ u8 indirect_levels; u8 unused_flags; } info; struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; struct dx_entry entries[]; }; struct dx_frame { struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; }; struct dx_map_entry { u32 hash; u16 offs; u16 size; }; /* * This goes at the end of each htree block. */ struct dx_tail { u32 dt_reserved; __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); static void dx_set_hash(struct dx_entry *entry, unsigned value); static unsigned dx_get_count(struct dx_entry *entries); static unsigned dx_get_limit(struct dx_entry *entries); static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *offsets, int count, unsigned int blocksize); static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); /* checksumming functions */ void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize) { struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( sizeof(struct ext4_dir_entry_tail), blocksize); t->det_reserved_ft = EXT4_FT_DIR_CSUM; } /* Walk through a dirent block to find a checksum "dirent" at the tail */ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + (blocksize - sizeof(struct ext4_dir_entry_tail))); while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; t = (struct ext4_dir_entry_tail *)d; #else t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); #endif if (t->det_reserved_zero1 || (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; return t; } static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } #define warn_no_space_for_csum(inode) \ __warn_no_space_for_csum((inode), __func__, __LINE__) static void __warn_no_space_for_csum(struct inode *inode, const char *func, unsigned int line) { __ext4_warning_inode(inode, func, line, "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return 0; } if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data)) return 0; return 1; } static void ext4_dirblock_csum_set(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return; } t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data); } int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dirblock_csum_set(inode, bh); return ext4_handle_dirty_metadata(handle, inode, bh); } static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) { struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); if (rlen == blocksize) count_offset = 8; else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || root->info_length != sizeof(struct dx_root_info)) return NULL; count_offset = 32; } else return NULL; if (offset) *offset = count_offset; return (struct dx_countlimit *)(((void *)dirent) + count_offset); } static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; __u32 dummy_csum = 0; int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, count, t)) return 0; return 1; } static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); } static inline int ext4_handle_dirty_dx_node(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); } /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) { entry->block = cpu_to_le32(value); } static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { printk(KERN_CONT " %x->%lu", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk(KERN_CONT "\n"); } struct stats { unsigned names; unsigned space; unsigned bcount; }; static struct stats dx_show_leaf(struct inode *dir, struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; printk("names: "); while ((char *) de < base + size) { if (de->inode) { if (show_names) { #ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); } else { struct fscrypt_str de_name = FSTR_INIT(name, len); /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" "\n"); name = "??"; len = 2; } else { name = fname_crypto_str.name; len = fname_crypto_str.len; } if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( &fname_crypto_str); } #else int len = de->name_len; char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); } printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { ext4_lblk_t block = dx_get_block(entries); ext4_lblk_t hash = i ? dx_get_hash(entries): 0; u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); if (!bh || IS_ERR(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; brelse(bh); } if (bcount) printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } /* * Linear search cross check */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { while (n--) { dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; break; } } ASSERT(at == target - 1); } #else /* DX_DEBUG */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { } #endif /* DX_DEBUG */ /* * Probe for a directory leaf block to search. * * dx_probe can return ERR_BAD_DX_DIR, which means there was a format * error in the directory index, and the caller should fall back to * searching the directory normally. The callers of dx_probe **MUST** * check for this error code, and make sure it never gets reflected * back to userspace. */ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; ext4_lblk_t block; ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY && root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } if (ext4_hash_in_dirent(dir)) { if (root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash in dirent, but hash is not SIPHASH"); goto fail; } } else { if (root->info.hash_version == DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash code is SIPHASH, but hash not in dirent"); goto fail; } } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { int ret = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); if (ret < 0) { ret_err = ERR_PTR(ret); goto fail; } } hash = hinfo->hash; if (root->info.unused_flags & 1) { ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", root->info.unused_flags); goto fail; } indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, "Directory (ino: %lu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ext4_warning(dir->i_sb, "Enable large directory " "feature to access it"); } goto fail; } entries = (struct dx_entry *)(((char *)&root->info) + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", dx_get_limit(entries), dx_root_limit(dir, root->info.info_length)); goto fail; } dxtrace(printk("Look up %x", hash)); level = 0; blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning_inode(dir, "dx entry: count %u beyond limit %u", count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else p = m + 1; } htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; block = dx_get_block(at); for (i = 0; i <= level; i++) { if (blocks[i] == block) { ext4_warning_inode(dir, "dx entry: tree cycle block %u points back to block %u", blocks[level], block); goto fail; } } if (++level > indirect) return frame; blocks[level] = block; frame++; frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { ext4_warning_inode(dir, "dx entry: limit %u != node limit %u", dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning_inode(dir, "Corrupt directory, running e2fsck is recommended"); return ret_err; } static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; /* save local copy, "info" may be freed after brelse() */ indirect_levels = info->indirect_levels; for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); frames[i].bh = NULL; } } /* * This function increments the frame pointer to search the next leaf * block, and reads in the necessary intervening nodes if the search * should be necessary. Whether or not the search is necessary is * controlled by the hash parameter. If the hash value is even, then * the search is only continued if the next block starts with that * hash value. This is used if we are searching for a specific file. * * If the hash value is HASH_NB_ALWAYS, then always go to the next block. * * This function returns 1 if the caller should continue to search, * or 0 if it should not. If there is an error reading one of the * index blocks, it will a negative error code. * * If start_hash is non-null, it will be filled in with the starting * hash of the next page. */ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; struct buffer_head *bh; int num_frames = 0; __u32 bhash; p = frame; /* * Find the next leaf page by incrementing the frame pointer. * If we run out of entries in the interior node, loop around and * increment pointer in the parent node. When we break out of * this loop, num_frames indicates the number of interior * nodes need to be read. */ while (1) { if (++(p->at) < p->entries + dx_get_count(p->entries)) break; if (p == frames) return 0; num_frames++; p--; } /* * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { if ((bhash & ~1) != hash) return 0; } /* * If the hash is HASH_NB_ALWAYS, we always go to the next * block so no check is necessary */ while (num_frames--) { bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } return 1; } /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ static int htree_dirblock_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash) { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ext4_dir_rec_len(0, csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; } err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); return err; } } for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; } if (ext4_hash_in_dirent(dir)) { if (de->name_len && de->inode) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hinfo->hash = 0; hinfo->minor_hash = 0; } } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err < 0) { count = err; goto errout; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { int save_len = fname_crypto_str.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); if (err) { count = err; goto errout; } err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); fname_crypto_str.len = save_len; } if (err != 0) { count = err; goto errout; } count++; } errout: brelse(bh); fscrypt_fname_free_buffer(&fname_crypto_str); return count; } /* * This function fills a red-black tree with information from a * directory. We start scanning the directory in hash order, starting * at start_hash and start_minor_hash. * * This function returns the number of entries inserted into the tree, * or a negative error code. */ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash) { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; int ret, err; __u32 hashval; struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { if (ext4_hash_in_dirent(dir)) hinfo.hash_version = DX_HASH_SIPHASH; else hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; count = ext4_inlinedir_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash, &has_inline_data); if (has_inline_data) { *next_hash = ~0; return count; } } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; return count; } hinfo.hash = start_hash; hinfo.minor_hash = 0; frame = dx_probe(NULL, dir, &hinfo, frames); if (IS_ERR(frame)) return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 0, 0, de, &tmp_str); if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 2, 0, de, &tmp_str); if (err != 0) goto errout; count++; } while (1) { if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { err = ret; goto errout; } count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { err = ret; goto errout; } /* * Stop if: (a) there are no more entries, or * (b) we have inserted at least one entry and the * next hash value is not a continuation */ if ((ret == 0) || (count && ((hashval & 1) == 0))) break; } dx_release(frames); dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, fname, offset, res_dir); } /* * Directory block splitting, compacting */ /* * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, ((char *)de) - base)) return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); else { int err = ext4fs_dirhash(dir, de->name, de->name_len, &h); if (err < 0) return err; } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; map_tail->size = ext4_rec_len_from_disk(de->rec_len, blocksize); count++; cond_resched(); } de = ext4_next_entry(de, blocksize); } return count; } /* Sort map by hash value */ static void dx_sort_map (struct dx_map_entry *map, unsigned count) { struct dx_map_entry *p, *q, *top = map + count - 1; int more; /* Combsort until bubble sort doesn't suck */ while (count > 2) { count = count*10/13; if (count - 9 < 2) /* 9, 10 -> 11 */ count = 11; for (p = top, q = p - count; q >= map; p--, q--) if (p->hash < q->hash) swap(*p, *q); } /* Garden variety bubble sort */ do { more = 0; q = top; while (q-- > map) { if (q[1].hash >= q[0].hash) continue; swap(*(q+1), *q); more = 1; } } while(more); } static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); ASSERT(count < dx_get_limit(entries)); ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); } #if IS_ENABLED(CONFIG_UNICODE) int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { struct qstr *cf_name = &name->cf_name; unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!buf) return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { kfree(buf); buf = NULL; } cf_name->name = buf; cf_name->len = (unsigned) len; if (!IS_ENCRYPTED(dir)) return 0; hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. */ static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { struct fscrypt_name f; if (!de->inode) return false; f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { /* * Just checking IS_ENCRYPTED(parent) below is not * sufficient to decide whether one can use the hash for * skipping the string comparison, because the key might * have been added right after * ext4_fname_setup_ci_filename(). In this case, a hash * mismatch will be a false negative. Therefore, make * sure cf_name was properly initialized before * considering the calculated hash. */ if (IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; /* * Treat comparison errors as not a match. The * only case where it happens is on a disk * corruption or ENOMEM. */ return generic_ci_match(parent, fname->usr_fname, &fname->cf_name, de->name, de->name_len) > 0; } #endif return fscrypt_match_name(&f, de->name, de->name_len); } /* * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) return -EFSCORRUPTED; *res_dir = de; return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, struct ext4_dir_entry *de) { struct super_block *sb = dir->i_sb; if (!is_dx(dir)) return 0; if (block == 0) return 1; if (de->inode == 0 && ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == sb->s_blocksize) return 1; return 0; } /* * __ext4_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ static struct buffer_head *__ext4_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; const u8 *name = fname->usr_fname->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ size_t ra_ptr = 0; /* Current index into readahead buffer */ ext4_lblk_t nblocks; int i, namelen, retval; *res_dir = NULL; sb = dir->i_sb; namelen = fname->usr_fname->len; if (namelen > EXT4_NAME_LEN) return NULL; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; ret = ext4_find_inline_entry(dir, fname, res_dir, &has_inline_data); if (inlined) *inlined = has_inline_data; if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS */ block = start = 0; nblocks = 1; goto restart; } if (is_dx(dir)) { ret = ext4_dx_find_entry(dir, fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { ret = NULL; goto cleanup_and_exit; } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. */ cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; if (block < start) ra_max = start - block; else ra_max = nblocks - block; ra_max = min(ra_max, ARRAY_SIZE(bh_use)); retval = ext4_bread_batch(dir, block, ra_max, false /* wait */, bh_use); if (retval) { ret = ERR_PTR(retval); ra_max = 0; goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_ERR(dir, EIO, "reading directory lblock %lu", (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) { ret = ERR_PTR(i); goto cleanup_and_exit; } } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. */ block = nblocks; nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); return ret; } static struct buffer_head *ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *inlined) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_setup_filename(dir, d_name, 1, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct dentry *dentry, struct ext4_dir_entry_2 **res_dir) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; #ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) goto success; brelse(bh); if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning_inode(dir, "error %d reading directory index block", retval); bh = ERR_PTR(retval); goto errout; } } while (retval == 1); bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); bh = ext4_lookup_entry(dir, dentry, &de); if (IS_ERR(bh)) return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } } if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry * from being cached. */ return NULL; } return d_splice_alias(inode, dentry); } struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; struct ext4_dir_entry_2 * de; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = ext4_dir_rec_len(de->name_len, dir); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); map++; to += rec_len; } return (struct ext4_dir_entry_2 *) (to - rec_len); } /* * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } de = next; } return prev; } /* * Split a full leaf block to make room for a new dir entry. * Allocate a new block, and move entries so that they are approx. equally full. * Returns pointer to de in block into which the new entry will be inserted. */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned continued; int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; int csum_size = 0; int err = 0, i; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; data2 = bh2->b_data; /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); count = dx_make_map(dir, *bh, hinfo, map); if (count < 0) { err = count; goto journal_error; } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { /* is more than half of this entry in 2nd half of the block? */ if (size + map[i].size/2 > blocksize/2) break; size += map[i].size; move++; } /* * map index at which we will split * * If the sum of active entries didn't exceed half the block size, just * split it in half by count; each resulting block will have at least * half the space free. */ if (i > 0) split = count - move; else split = count/2; if (WARN_ON_ONCE(split == 0)) { /* Should never happen, but avoid out-of-bounds access below */ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, "bad indexed directory? hash=%08x:%08x count=%d move=%u", hinfo->hash, hinfo->minor_hash, count, move); err = -EFSCORRUPTED; goto out; } hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de2, blocksize); if (csum_size) { ext4_initialize_dirent_tail(*bh, blocksize); ext4_initialize_dirent_tail(bh2, blocksize); } dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } dx_insert_block(frame, hash2 + continued, newblock); err = ext4_handle_dirty_dirblock(handle, dir, bh2); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: ext4_std_error(dir->i_sb, err); out: brelse(*bh); brelse(bh2); *bh = NULL; return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -ENOSPC; *dest_de = de; return 0; } void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) { int nlen, rlen; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; } de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); if (ext4_hash_in_dirent(dir)) { struct dx_hash_info *hinfo = &fname->hinfo; EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); EXT4_DIRENT_HASHES(de)->minor_hash = cpu_to_le32(hinfo->minor_hash); } } /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large * enough for new directory entry. If de is NULL, then * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { err = ext4_find_dest_de(dir, inode, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; } /* By now the buffer is marked for journaling */ ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. * * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return err ? err : err2; } static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) { struct fake_dirent *fde; const char *error_msg; unsigned int rlen; unsigned int blocksize = dir->i_sb->s_blocksize; char *blockend = (char *)root + dir->i_sb->s_blocksize; fde = &root->dot; if (unlikely(fde->name_len != 1)) { error_msg = "invalid name_len for '.'"; goto corrupted; } if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { error_msg = "invalid name for '.'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '.'"; goto corrupted; } fde = &root->dotdot; if (unlikely(fde->name_len != 2)) { error_msg = "invalid name_len for '..'"; goto corrupted; } if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { error_msg = "invalid name for '..'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '..'"; goto corrupted; } return true; corrupted: EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", error_msg); return false; } /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct buffer_head *bh) { struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; char *data2, *top; unsigned len; int retval; unsigned blocksize; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); return retval; } root = (struct dx_root *) bh->b_data; if (!ext4_check_dx_root(dir, root)) { brelse(bh); return -EFSCORRUPTED; } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block); if (IS_ERR(bh2)) { brelse(bh); return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data2 = bh2->b_data; memcpy(data2, de, len); memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; } de = de2; } de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh2, blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk( blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); if (ext4_hash_in_dirent(dir)) root->info.hash_version = DX_HASH_SIPHASH; else root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ fname->hinfo.hash_version = root->info.hash_version; if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ if (!ext4_hash_in_dirent(dir)) { int err = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); if (err < 0) { brelse(bh2); brelse(bh); return err; } } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; frame->at = entries; frame->bh = bh; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ if (retval) ext4_mark_inode_dirty(handle, dir); dx_release(frames); brelse(bh2); return retval; } /* * ext4_add_entry() * * adds a file entry to the specified directory, using the same * semantics as ext4_find_entry(). It returns NULL if it failed. * * NOTE!! The inode part of 'de' is left at 0 - which means you * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) return retval; if (ext4_has_inline_data(dir)) { retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { retval = 0; goto out; } } if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ if (ext4_has_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; goto out; } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; retval = ext4_mark_inode_dirty(handle, dir); if (unlikely(retval)) goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); if (bh == NULL) { bh = ext4_bread(handle, dir, block, EXT4_GET_BLOCKS_CREATE); goto add_to_new_block; } if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } retval = add_dirent_to_buf(handle, &fname, dir, inode, NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); add_to_new_block: if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh, blocksize); retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int restart; int err; again: restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto cleanup; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; int levels = frame - frames + 1; unsigned int icount; int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; while (frame > frames) { if (dx_get_count((frame - 1)->entries) < dx_get_limit((frame - 1)->entries)) { add_level = 0; break; } frame--; /* split higher index block */ at = frame->at; entries = frame->entries; restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { ext4_warning(sb, "Directory (ino: %lu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ext4_warning(sb, "Large directory feature is " "not enabled on this " "filesystem"); } err = -ENOSPC; goto cleanup; } icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); goto cleanup; } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); if (err) goto journal_error; memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); dx_set_count(entries, icount1); dx_set_count(entries2, icount2); dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } dx_insert_block((frame - 1), hash2, newblock); dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (restart || err) goto journal_error; } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); dxroot = (struct dx_root *)frames[0].bh->b_data; dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); /* @restart is true means htree-path has been changed, we need to * repeat dx_probe() to find out valid htree-path */ if (restart && err == 0) goto again; return err; } /* * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; pde = NULL; de = entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); /* wipe entire dir_entry */ memset(de, 0, ext4_rec_len_from_disk(de->rec_len, blocksize)); } else { /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); } inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; de = ext4_next_entry(de, blocksize); } return -ENOENT; } static int ext4_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh) { int err, csum_size = 0; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; err = ext4_delete_inline_entry(handle, dir, de_del, bh, &has_inline_data); if (has_inline_data) return err; } if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (unlikely(err)) goto out; return 0; out: if (err != -ENOENT) ext4_std_error(dir->i_sb, err); return err; } /* * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2 * since this indicates that nlinks count was previously 1 to avoid overflowing * the 16-bit i_links_count field on disk. Directories with i_nlink == 1 mean * that subdirectory link counts are not being maintained accurately. * * The caller has already checked for i_nlink overflow in case the DIR_LINK * feature is not enabled and returned -EMLINK. The is_dx() check is a proxy * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set * on regular files) and to avoid creating huge/slow non-HTREE directories. */ static void ext4_inc_count(struct inode *inode) { inc_nlink(inode); if (is_dx(inode) && (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) set_nlink(inode, 1); } /* * If a directory had nlink == 1, then we should let it be 1. This indicates * directory has >EXT4_LINK_MAX subdirs. */ static void ext4_dec_count(struct inode *inode) { if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) drop_nlink(inode); } /* * Add non-directory inode to a directory. On success, the inode reference is * consumed by dentry is instantiation. This is also indicated by clearing of * *inodep pointer. On failure, the caller is responsible for dropping the * inode reference in the safe context. */ static int ext4_add_nondir(handle_t *handle, struct dentry *dentry, struct inode **inodep) { struct inode *dir = d_inode(dentry->d_parent); struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; return err; } drop_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; } /* * By the time this is called, we already have created * the directory cache entry for the new file, but it * is so far negative - it has no inode. * * If the create succeeds, we fill in the inode information * with d_instantiate(). */ static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; int err, retries = 0; err = dquot_initialize(dir); if (err) return err; retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } if (handle) ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); return err; } struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len) { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); return ext4_next_entry(de, blocksize); } int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) goto out; if (!err) goto out; } inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); if (csum_size) ext4_initialize_dirent_tail(dir_block, blocksize); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) goto out; set_buffer_verified(dir_block); out: brelse(dir_block); return err; } static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2)) err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; } ext4_inc_count(dir); ext4_update_dx_flag(dir); err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); out_stop: if (handle) ext4_journal_stop(handle); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } /* * routine to check that the specified directory is empty (for rmdir) */ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; int ret; ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) return ret; } sb = inode->i_sb; if (inode->i_size < ext4_dir_rec_len(1, NULL) + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return false; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (bh == NULL) { offset += sb->s_blocksize; continue; } if (IS_ERR(bh)) return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode)) { brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } brelse(bh); return true; } static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle = NULL; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in * separate transaction */ retval = dquot_initialize(dir); if (retval) return retval; retval = dquot_initialize(d_inode(dentry)); if (retval) return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto end_rmdir; inode = d_inode(dentry); retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rmdir; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) ext4_warning_inode(inode, "empty directory '%.*s' has too many links (%u)", dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode_inc_iversion(inode); clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is * zero will ensure that the right thing happens during any * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); end_rmdir: brelse(bh); if (handle) ext4_journal_stop(handle); return retval; } int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle; int skip_remove_dentry = 0; /* * Keep this outside the transaction; it may have to set up the * directory's encryption key, which isn't GFP_NOFS-safe. */ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) return -ENOENT; if (le32_to_cpu(de->inode) != inode->i_ino) { /* * It's okay if we find dont find dentry which matches * the inode. That's because it might have gotten * renamed to a different inode number */ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else goto out_bh; } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_bh; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) goto out_handle; } else { retval = 0; } if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", d_name->len, d_name->name); else drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); out_handle: ext4_journal_stop(handle); out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); /* * Initialize quotas before so that eventual writes go * in separate transaction */ retval = dquot_initialize(dir); if (retval) goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, struct fscrypt_str *disk_link) { struct buffer_head *bh; char *kaddr; int err = 0; bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return PTR_ERR(bh); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; kaddr = (char *)bh->b_data; memcpy(kaddr, disk_link->name, disk_link->len); inode->i_size = disk_link->len - 1; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_handle_dirty_metadata(handle, inode, bh); out: brelse(bh); return err; } static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; struct inode *inode; int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; int retries = 0; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); if (err) return err; err = dquot_initialize(dir); if (err) return err; /* * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the * directory. +3 for inode, inode bitmap, group descriptor allocation. * EXT4_DATA_TRANS_BLOCKS for the data block allocation and * modification. */ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); err = PTR_ERR(inode); goto out_retry; } if (IS_ENCRYPTED(inode)) { err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; } else { if ((disk_link.len > EXT4_N_BLOCKS * 4)) { inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_link = (char *)&EXT4_I(inode)->i_data; } } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* alloc symlink block and fill it */ err = ext4_init_symlink_block(handle, inode, &disk_link); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; } err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); iput(inode); goto out_retry; err_drop_inode: clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) ext4_journal_stop(handle); iput(inode); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; } int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) { handle_t *handle; int err, retries = 0; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int err; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; err = dquot_initialize(dir); if (err) return err; return __ext4_link(dir, inode, dentry); } /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block * if it is a normal dir. */ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct inode *inode, int *retval, struct ext4_dir_entry_2 **parent_de, int *inlined) { struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { struct ext4_dir_entry_2 *de; unsigned int offset; bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; } de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } offset = ext4_rec_len_from_disk(de->rec_len, inode->i_sb->s_blocksize); de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } *parent_de = de; return bh; } *inlined = 1; return ext4_get_first_inline_block(inode, parent_de, retval); } struct ext4_renament { struct inode *dir; struct dentry *dentry; struct inode *inode; bool is_dir; int dir_nlink_delta; /* entry for "dentry" */ struct buffer_head *bh; struct ext4_dir_entry_2 *de; int inlined; /* entry for ".." in inode if it's a directory */ struct buffer_head *dir_bh; struct ext4_dir_entry_2 *parent_de; int dir_inlined; }; static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; ent->is_dir = true; if (!is_cross) return 0; ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, unsigned dir_ino) { int retval; if (!ent->dir_bh) return 0; ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { if (is_dx(ent->inode)) { retval = ext4_handle_dirty_dx_node(handle, ent->inode, ent->dir_bh); } else { retval = ext4_handle_dirty_dirblock(handle, ent->inode, ent->dir_bh); } } else { retval = ext4_mark_inode_dirty(handle, ent->inode); } if (retval) { ext4_std_error(ent->dir->i_sb, retval); return retval; } return 0; } static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); if (unlikely(retval2)) { ext4_std_error(ent->dir->i_sb, retval2); return retval2; } } return retval; } static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { struct ext4_renament old = *ent; int retval = 0; /* * old->de could have moved from under us during make indexed dir, * so the old->de may no longer valid and need to find it again * before reset old inode info. */ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) retval = PTR_ERR(old.bh); if (!old.bh) retval = -ENOENT; if (retval) { ext4_std_error(old.dir->i_sb, retval); return; } ext4_setent(handle, &old, ino, file_type); brelse(old.bh); } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); } return retval; } static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, int force_reread) { int retval; /* * ent->de could have moved from under us during htree split, so make * sure that we are deleting the right entry. We might also be pointing * to a stale entry in the unused part of ent->bh so just checking inum * and the name isn't enough. */ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, ent->de->name_len) || force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); if (retval == -ENOENT) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } } if (retval) { ext4_warning_inode(ent->dir, "Deleting old file: nlink %d, error=%d", ent->dir->i_nlink, retval); } } static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) { if (ent->dir_nlink_delta) { if (ent->dir_nlink_delta == -1) ext4_dec_count(ent->dir); else ext4_inc_count(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); } } static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; handle_t *handle; int retries = 0; /* * for inode block, sb block, group summaries, * and inode bitmap */ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(wh)) { if (handle) ext4_journal_stop(handle); if (PTR_ERR(wh) == -ENOSPC && ext4_should_retry_alloc(ent->dir->i_sb, &retries)) goto retry; } else { *h = handle; init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); wh->i_op = &ext4_special_inode_operations; } return wh; } /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. * * n.b. old_{dentry,inode) refers to the source dentry/inode * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; int force_reread; int retval; struct inode *whiteout = NULL; int credits; u8 old_file_type; if (new.inode && new.inode->i_nlink == 0) { EXT4_ERROR_INODE(new.inode, "target of rename is already freed"); return -EFSCORRUPTED; } if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new.inode) { retval = dquot_initialize(new.inode); if (retval) return retval; } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto release_bh; } if (new.bh) { if (!new.inode) { brelse(new.bh); new.bh = NULL; } } if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; } } old_file_type = old.de->file_type; if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } /* * If we're renaming a file within an inline_data dir and adding or * setting the new dirent causes a conversion from inline_data to * extents/blockmap, we need to force the dirent delete code to * re-read the directory, or else we end up trying to delete a dirent * from what is now the extent tree root (or a block map). */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (whiteout) { /* * Do this before adding a new entry, so the old entry is sure * to be still pointing to the valid old entry. */ retval = ext4_setent(handle, &old, whiteout->i_ino, EXT4_FT_CHRDEV); if (retval) goto end_rename; retval = ext4_mark_inode_dirty(handle, whiteout); if (unlikely(retval)) goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, old.inode->i_ino, old_file_type); if (retval) goto end_rename; } if (force_reread) force_reread = !ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; if (!whiteout) { /* * ok, that's it */ ext4_rename_delete(handle, &old, force_reread); } if (new.inode) { ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; ext4_dec_count(old.dir); if (new.inode) { /* checked ext4_empty_dir above, can't have another * parent, ext4_dec_count() won't work for many-linked * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(new.dir); ext4_update_dx_flag(new.dir); retval = ext4_mark_inode_dirty(handle, new.dir); if (unlikely(retval)) goto end_rename; } } retval = ext4_mark_inode_dirty(handle, old.dir); if (unlikely(retval)) goto end_rename; if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, EXT4_FC_REASON_RENAME_DIR, handle); } else { struct super_block *sb = old.inode->i_sb; if (new.inode) ext4_fc_track_unlink(handle, new.dentry); if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) { __ext4_fc_track_link(handle, old.inode, new.dentry); __ext4_fc_track_unlink(handle, old.inode, old.dentry); if (whiteout) __ext4_fc_track_create(handle, whiteout, old.dentry); } } if (new.inode) { retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: if (whiteout) { if (retval) { ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); ext4_journal_stop(handle); iput(whiteout); } else { ext4_journal_stop(handle); } release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); return retval; } static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(old_dir)->i_projid, EXT4_I(new_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto end_rename; } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) goto end_rename; handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rename; } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } /* * Other than the special case of overwriting a directory, parents' * nlink only needs to be modified if this is a cross directory rename. */ if (old.dir != new.dir && old.is_dir != new.is_dir) { old.dir_nlink_delta = old.is_dir ? -1 : 1; new.dir_nlink_delta = -old.dir_nlink_delta; retval = -EMLINK; if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) goto end_rename; } new_file_type = new.de->file_type; retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); if (retval) goto end_rename; /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; } if (new.dir_bh) { retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); if (retval) goto end_rename; } ext4_update_dir_count(handle, &old); ext4_update_dir_count(handle, &new); retval = 0; end_rename: brelse(old.dir_bh); brelse(new.dir_bh); brelse(old.bh); brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int err; if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* * directories can handle most operations... */ const struct inode_operations ext4_dir_inode_operations = { .create = ext4_create, .lookup = ext4_lookup, .link = ext4_link, .unlink = ext4_unlink, .symlink = ext4_symlink, .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename2, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; |
| 19 19 19 8 19 19 6 14 19 2 19 19 16 5 19 6 19 3 19 19 16 19 6 2 2 2 2 2 2 17 17 19 19 19 19 19 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "debug.h" #include "ntfs_fs.h" // clang-format off /* Src buffer is zero. */ #define LZNT_ERROR_ALL_ZEROS 1 #define LZNT_CHUNK_SIZE 0x1000 // clang-format on struct lznt_hash { const u8 *p1; const u8 *p2; }; struct lznt { const u8 *unc; const u8 *unc_end; const u8 *best_match; size_t max_len; bool std; struct lznt_hash hash[LZNT_CHUNK_SIZE]; }; static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev, size_t max_len) { size_t len = 0; while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len) ; return len; } static size_t longest_match_std(const u8 *src, struct lznt *ctx) { size_t hash_index; size_t len1 = 0, len2 = 0; const u8 **hash; hash_index = ((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) & (LZNT_CHUNK_SIZE - 1); hash = &(ctx->hash[hash_index].p1); if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] && hash[0][1] == src[1] && hash[0][2] == src[2]) { len1 = 3; if (ctx->max_len > 3) len1 += get_match_len(src + 3, ctx->unc_end, hash[0] + 3, ctx->max_len - 3); } if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] && hash[1][1] == src[1] && hash[1][2] == src[2]) { len2 = 3; if (ctx->max_len > 3) len2 += get_match_len(src + 3, ctx->unc_end, hash[1] + 3, ctx->max_len - 3); } /* Compare two matches and select the best one. */ if (len1 < len2) { ctx->best_match = hash[1]; len1 = len2; } else { ctx->best_match = hash[0]; } hash[1] = hash[0]; hash[0] = src; return len1; } static size_t longest_match_best(const u8 *src, struct lznt *ctx) { size_t max_len; const u8 *ptr; if (ctx->unc >= src || !ctx->max_len) return 0; max_len = 0; for (ptr = ctx->unc; ptr < src; ++ptr) { size_t len = get_match_len(src, ctx->unc_end, ptr, ctx->max_len); if (len >= max_len) { max_len = len; ctx->best_match = ptr; } } return max_len >= 3 ? max_len : 0; } static const size_t s_max_len[] = { 0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12, }; static const size_t s_max_off[] = { 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, }; static inline u16 make_pair(size_t offset, size_t len, size_t index) { return ((offset - 1) << (12 - index)) | ((len - 3) & (((1 << (12 - index)) - 1))); } static inline size_t parse_pair(u16 pair, size_t *offset, size_t index) { *offset = 1 + (pair >> (12 - index)); return 3 + (pair & ((1 << (12 - index)) - 1)); } /* * compress_chunk * * Return: * * 0 - Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data. * * 1 - Input buffer is full zero. * * -2 - The compressed buffer is too small to hold the compressed data. */ static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *), const u8 *unc, const u8 *unc_end, u8 *cmpr, u8 *cmpr_end, size_t *cmpr_chunk_size, struct lznt *ctx) { size_t cnt = 0; size_t idx = 0; const u8 *up = unc; u8 *cp = cmpr + 3; u8 *cp2 = cmpr + 2; u8 not_zero = 0; /* Control byte of 8-bit values: ( 0 - means byte as is, 1 - short pair ). */ u8 ohdr = 0; u8 *last; u16 t16; if (unc + LZNT_CHUNK_SIZE < unc_end) unc_end = unc + LZNT_CHUNK_SIZE; last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end); ctx->unc = unc; ctx->unc_end = unc_end; ctx->max_len = s_max_len[0]; while (up < unc_end) { size_t max_len; while (unc + s_max_off[idx] < up) ctx->max_len = s_max_len[++idx]; /* Find match. */ max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0; if (!max_len) { if (cp >= last) goto NotCompressed; not_zero |= *cp++ = *up++; } else if (cp + 1 >= last) { goto NotCompressed; } else { t16 = make_pair(up - ctx->best_match, max_len, idx); *cp++ = t16; *cp++ = t16 >> 8; ohdr |= 1 << cnt; up += max_len; } cnt = (cnt + 1) & 7; if (!cnt) { *cp2 = ohdr; ohdr = 0; cp2 = cp; cp += 1; } } if (cp2 < last) *cp2 = ohdr; else cp -= 1; *cmpr_chunk_size = cp - cmpr; t16 = (*cmpr_chunk_size - 3) | 0xB000; cmpr[0] = t16; cmpr[1] = t16 >> 8; return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS; NotCompressed: if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last) return -2; /* * Copy non cmpr data. * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000) */ cmpr[0] = 0xff; cmpr[1] = 0x3f; memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE); *cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short); return 0; } static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr, const u8 *cmpr_end) { u8 *up = unc; u8 ch = *cmpr++; size_t bit = 0; size_t index = 0; u16 pair; size_t offset, length; /* Do decompression until pointers are inside range. */ while (up < unc_end && cmpr < cmpr_end) { // return err if more than LZNT_CHUNK_SIZE bytes are written if (up - unc > LZNT_CHUNK_SIZE) return -EINVAL; /* Correct index */ while (unc + s_max_off[index] < up) index += 1; /* Check the current flag for zero. */ if (!(ch & (1 << bit))) { /* Just copy byte. */ *up++ = *cmpr++; goto next; } /* Check for boundary. */ if (cmpr + 1 >= cmpr_end) return -EINVAL; /* Read a short from little endian stream. */ pair = cmpr[1]; pair <<= 8; pair |= cmpr[0]; cmpr += 2; /* Translate packed information into offset and length. */ length = parse_pair(pair, &offset, index); /* Check offset for boundary. */ if (unc + offset > up) return -EINVAL; /* Truncate the length if necessary. */ if (up + length >= unc_end) length = unc_end - up; /* Now we copy bytes. This is the heart of LZ algorithm. */ for (; length > 0; length--, up++) *up = *(up - offset); next: /* Advance flag bit value. */ bit = (bit + 1) & 7; if (!bit) { if (cmpr >= cmpr_end) break; ch = *cmpr++; } } /* Return the size of uncompressed data. */ return up - unc; } /* * get_lznt_ctx * @level: 0 - Standard compression. * !0 - Best compression, requires a lot of cpu. */ struct lznt *get_lznt_ctx(int level) { struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) : sizeof(struct lznt), GFP_NOFS); if (r) r->std = !level; return r; } /* * compress_lznt - Compresses @unc into @cmpr * * Return: * * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data. * * 0 - Input buffer is full zero. */ size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr, size_t cmpr_size, struct lznt *ctx) { int err; size_t (*match)(const u8 *src, struct lznt *ctx); u8 *p = cmpr; u8 *end = p + cmpr_size; const u8 *unc_chunk = unc; const u8 *unc_end = unc_chunk + unc_size; bool is_zero = true; if (ctx->std) { match = &longest_match_std; memset(ctx->hash, 0, sizeof(ctx->hash)); } else { match = &longest_match_best; } /* Compression cycle. */ for (; unc_chunk < unc_end; unc_chunk += LZNT_CHUNK_SIZE) { cmpr_size = 0; err = compress_chunk(match, unc_chunk, unc_end, p, end, &cmpr_size, ctx); if (err < 0) return unc_size; if (is_zero && err != LZNT_ERROR_ALL_ZEROS) is_zero = false; p += cmpr_size; } if (p <= end - 2) p[0] = p[1] = 0; return is_zero ? 0 : PtrOffset(cmpr, p); } /* * decompress_lznt - Decompress @cmpr into @unc. */ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc, size_t unc_size) { const u8 *cmpr_chunk = cmpr; const u8 *cmpr_end = cmpr_chunk + cmpr_size; u8 *unc_chunk = unc; u8 *unc_end = unc_chunk + unc_size; u16 chunk_hdr; if (cmpr_size < sizeof(short)) return -EINVAL; /* Read chunk header. */ chunk_hdr = cmpr_chunk[1]; chunk_hdr <<= 8; chunk_hdr |= cmpr_chunk[0]; /* Loop through decompressing chunks. */ for (;;) { size_t chunk_size_saved; size_t unc_use; size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1)); /* Check that the chunk actually fits the supplied buffer. */ if (cmpr_chunk + cmpr_use > cmpr_end) return -EINVAL; /* First make sure the chunk contains compressed data. */ if (chunk_hdr & 0x8000) { /* Decompress a chunk and return if we get an error. */ ssize_t err = decompress_chunk(unc_chunk, unc_end, cmpr_chunk + sizeof(chunk_hdr), cmpr_chunk + cmpr_use); if (err < 0) return err; unc_use = err; } else { /* This chunk does not contain compressed data. */ unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ? unc_end - unc_chunk : LZNT_CHUNK_SIZE; if (cmpr_chunk + sizeof(chunk_hdr) + unc_use > cmpr_end) { return -EINVAL; } memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr), unc_use); } /* Advance pointers. */ cmpr_chunk += cmpr_use; unc_chunk += unc_use; /* Check for the end of unc buffer. */ if (unc_chunk >= unc_end) break; /* Proceed the next chunk. */ if (cmpr_chunk > cmpr_end - 2) break; chunk_size_saved = LZNT_CHUNK_SIZE; /* Read chunk header. */ chunk_hdr = cmpr_chunk[1]; chunk_hdr <<= 8; chunk_hdr |= cmpr_chunk[0]; if (!chunk_hdr) break; /* Check the size of unc buffer. */ if (unc_use < chunk_size_saved) { size_t t1 = chunk_size_saved - unc_use; u8 *t2 = unc_chunk + t1; /* 'Zero' memory. */ if (t2 >= unc_end) break; memset(unc_chunk, 0, t1); unc_chunk = t2; } } /* Check compression boundary. */ if (cmpr_chunk > cmpr_end) return -EINVAL; /* * The unc size is just a difference between current * pointer and original one. */ return PtrOffset(unc, unc_chunk); } |
| 309 343 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_ERRCODE_H #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ x(EINVAL, mount_option) \ x(BCH_ERR_mount_option, option_name) \ x(BCH_ERR_mount_option, option_value) \ x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ x(ENOMEM, ENOMEM_replicas_gc) \ x(ENOMEM, ENOMEM_disk_groups_validate) \ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ x(ENOMEM, ENOMEM_mark_snapshot) \ x(ENOMEM, ENOMEM_mark_stripe) \ x(ENOMEM, ENOMEM_mark_stripe_ptr) \ x(ENOMEM, ENOMEM_btree_key_cache_create) \ x(ENOMEM, ENOMEM_btree_key_cache_fill) \ x(ENOMEM, ENOMEM_btree_key_cache_insert) \ x(ENOMEM, ENOMEM_trans_kmalloc) \ x(ENOMEM, ENOMEM_trans_log_msg) \ x(ENOMEM, ENOMEM_do_encrypt) \ x(ENOMEM, ENOMEM_ec_read_extent) \ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ x(ENOMEM, ENOMEM_fs_btree_cache_init) \ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ x(ENOMEM, ENOMEM_fs_counters_init) \ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ x(ENOMEM, ENOMEM_io_clock_init) \ x(ENOMEM, ENOMEM_blacklist_table_init) \ x(ENOMEM, ENOMEM_sb_realloc_injected) \ x(ENOMEM, ENOMEM_sb_bio_realloc) \ x(ENOMEM, ENOMEM_sb_buf_realloc) \ x(ENOMEM, ENOMEM_sb_journal_validate) \ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ x(ENOMEM, ENOMEM_journal_entry_add) \ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ x(ENOMEM, ENOMEM_bio_read_init) \ x(ENOMEM, ENOMEM_bio_read_split_init) \ x(ENOMEM, ENOMEM_bio_write_init) \ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ x(ENOMEM, ENOMEM_writepage_bioset_init) \ x(ENOMEM, ENOMEM_dio_read_bioset_init) \ x(ENOMEM, ENOMEM_dio_write_bioset_init) \ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ x(ENOMEM, ENOMEM_promote_table_init) \ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ x(ENOMEM, ENOMEM_decompression_workspace_init) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ x(ENOMEM, ENOMEM_btree_node_reclaim) \ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ x(ENOMEM, ENOMEM_dev_journal_init) \ x(ENOMEM, ENOMEM_journal_pin_fifo) \ x(ENOMEM, ENOMEM_journal_buf) \ x(ENOMEM, ENOMEM_gc_start) \ x(ENOMEM, ENOMEM_gc_alloc_start) \ x(ENOMEM, ENOMEM_gc_reflink_start) \ x(ENOMEM, ENOMEM_gc_gens) \ x(ENOMEM, ENOMEM_gc_repair_key) \ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ x(ENOMEM, ENOMEM_fsck_add_nlink) \ x(ENOMEM, ENOMEM_journal_key_insert) \ x(ENOMEM, ENOMEM_journal_keys_sort) \ x(ENOMEM, ENOMEM_read_superblock_clean) \ x(ENOMEM, ENOMEM_fs_alloc) \ x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOMEM, ENOMEM_disk_accounting) \ x(ENOMEM, ENOMEM_stripe_head_alloc) \ x(ENOMEM, ENOMEM_journal_read_bucket) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ x(ENOSPC, ENOSPC_stripe_create) \ x(ENOSPC, ENOSPC_inode_create) \ x(ENOSPC, ENOSPC_str_hash_create) \ x(ENOSPC, ENOSPC_snapshot_create) \ x(ENOSPC, ENOSPC_subvolume_create) \ x(ENOSPC, ENOSPC_sb) \ x(ENOSPC, ENOSPC_sb_journal) \ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ x(ENOSPC, ENOSPC_sb_quota) \ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ x(ENOSPC, ENOSPC_sb_members_v2) \ x(ENOSPC, ENOSPC_sb_crypt) \ x(ENOSPC, ENOSPC_sb_downgrade) \ x(ENOSPC, ENOSPC_btree_slot) \ x(ENOSPC, ENOSPC_snapshot_tree) \ x(ENOENT, ENOENT_bkey_type_mismatch) \ x(ENOENT, ENOENT_str_hash_lookup) \ x(ENOENT, ENOENT_str_hash_set_must_replace) \ x(ENOENT, ENOENT_inode) \ x(ENOENT, ENOENT_not_subvol) \ x(ENOENT, ENOENT_not_directory) \ x(ENOENT, ENOENT_directory_dead) \ x(ENOENT, ENOENT_subvolume) \ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ x(EEXIST, EEXIST_discard_in_flight_add) \ x(EEXIST, EEXIST_subvolume_create) \ x(ENOSPC, open_buckets_empty) \ x(ENOSPC, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ x(0, transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ x(BCH_ERR_no_btree_node, no_btree_node_drop) \ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ x(BCH_ERR_no_btree_node, no_btree_node_up) \ x(BCH_ERR_no_btree_node, no_btree_node_down) \ x(BCH_ERR_no_btree_node, no_btree_node_init) \ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ x(0, btree_insert_fail) \ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, restart_recovery) \ x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ x(EINVAL, block_size_too_small) \ x(EINVAL, bucket_size_too_small) \ x(EINVAL, device_size_too_small) \ x(EINVAL, device_size_too_big) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ x(EROFS, erofs_sb_err) \ x(EROFS, erofs_unfixed_errors) \ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ x(BCH_ERR_invalid_sb, invalid_sb_features) \ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ x(BCH_ERR_invalid_sb, invalid_sb_layout) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ x(BCH_ERR_invalid_sb, invalid_sb_journal) \ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ x(BCH_ERR_invalid_sb, invalid_sb_errors) \ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ x(BCH_ERR_invalid_sb, invalid_sb_ext) \ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ x(0, nopromote) \ x(BCH_ERR_nopromote, nopromote_may_not) \ x(BCH_ERR_nopromote, nopromote_already_promoted) \ x(BCH_ERR_nopromote, nopromote_unwritten) \ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) \ x(0, remove_disk_accounting_entry) enum bch_errcode { BCH_ERR_START = 2048, #define x(class, err) BCH_ERR_##err, BCH_ERRCODES() #undef x BCH_ERR_MAX }; const char *bch2_err_str(int); bool __bch2_err_matches(int, int); static inline bool _bch2_err_matches(int err, int class) { return err < 0 && __bch2_err_matches(err, class); } #define bch2_err_matches(_err, _class) \ ({ \ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ unlikely(_bch2_err_matches(_err, _class)); \ }) int __bch2_err_class(int); static inline long bch2_err_class(long err) { return err < 0 ? __bch2_err_class(err) : err; } #define BLK_STS_REMOVED ((__force blk_status_t)128) const char *bch2_blk_status_to_str(blk_status_t); #endif /* _BCACHFES_ERRCODE_H */ |
| 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_DRIVER_H #define _LINUX_TTY_DRIVER_H #include <linux/export.h> #include <linux/fs.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/cdev.h> #include <linux/uaccess.h> #include <linux/termios.h> #include <linux/seq_file.h> struct tty_struct; struct tty_driver; struct serial_icounter_struct; struct serial_struct; /** * struct tty_operations -- interface between driver and tty * * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *, * int idx)`` * * Return the tty device corresponding to @idx, %NULL if there is not * one currently in use and an %ERR_PTR value on error. Called under * %tty_mutex (for now!) * * Optional method. Default behaviour is to use the @self->ttys array. * * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)`` * * Install a new @tty into the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)`` * * Remove a closed @tty from the @self's internal tables. Used in * conjunction with @lookup and @remove methods. * * Optional method. Default behaviour is to use the @self->ttys array. * * @open: ``int ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is opened. This * routine is mandatory; if this routine is not filled in, the attempted * open will fail with %ENODEV. * * Required method. Called with tty lock held. May sleep. * * @close: ``void ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is closed. At the * point of return from this call the driver must make no further ldisc * calls of any kind. * * Remark: called even if the corresponding @open() failed. * * Required method. Called with tty lock held. May sleep. * * @shutdown: ``void ()(struct tty_struct *tty)`` * * This routine is called under the tty lock when a particular @tty device * is closed for the last time. It executes before the @tty resources * are freed so may execute while another function holds a @tty kref. * * @cleanup: ``void ()(struct tty_struct *tty)`` * * This routine is called asynchronously when a particular @tty device * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)`` * * This routine is called by the kernel to write a series (@count) of * characters (@buf) to the @tty device. The characters may come from * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * * May occur in parallel in special cases. Because this includes panic * paths drivers generally shouldn't try and do clever locking here. * * Optional: Required for writable devices. May not sleep. * * @put_char: ``int ()(struct tty_struct *tty, u8 ch)`` * * This routine is called by the kernel to write a single character @ch to * the @tty device. If the kernel uses this routine, it must call the * @flush_chars() routine (if defined) when it is done stuffing characters * into the driver. If there is no room in the queue, the character is * ignored. * * Optional: Kernel will use the @write method if not provided. Do not * call this function directly, call tty_put_char(). * * @flush_chars: ``void ()(struct tty_struct *tty)`` * * This routine is called by the kernel after it has written a * series of characters to the tty device using @put_char(). * * Optional. Do not call this function directly, call * tty_driver_flush_chars(). * * @write_room: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the numbers of characters the @tty driver * will accept for queuing to be written. This number is subject * to change as output buffers get emptied, or if the output flow * control is acted. * * The ldisc is responsible for being intelligent about multi-threading of * write_room/write calls * * Required if @write method is provided else not needed. Do not call this * function directly, call tty_write_room() * * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the number of characters in the device private * output queue. Used in tty_wait_until_sent() and for poll() * implementation. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call tty_chars_in_buffer(). * * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * This routine allows the @tty driver to implement device-specific * ioctls. If the ioctl number passed in @cmd is not recognized by the * driver, it should return %ENOIOCTLCMD. * * Optional. * * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * Implement ioctl processing for 32 bit process on 64 bit system. * * Optional. * * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. * Previous settings are passed in the @old argument. * * The API is defined such that the driver should return the actual modes * selected. This means that the driver is responsible for modifying any * bits in @tty->termios it cannot fulfill to indicate the actual modes * being used. * * Optional. Called under the @tty->termios_rwsem. May sleep. * * @ldisc_ok: ``int ()(struct tty_struct *tty, int ldisc)`` * * This routine allows the @tty driver to decide if it can deal * with a particular @ldisc. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @set_ldisc: ``void ()(struct tty_struct *tty)`` * * This routine allows the @tty driver to be notified when the device's * line discipline is being changed. At the point this is done the * discipline is not yet usable. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @throttle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that input buffers for the line * discipline are close to full, and it should somehow signal that no more * characters should be sent to the @tty. * * Serialization including with @unthrottle() is the job of the ldisc * layer. * * Optional: Always invoke via tty_throttle_safe(). Called under the * @tty->termios_rwsem. * * @unthrottle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should signal that * characters can now be sent to the @tty without fear of overrunning the * input buffers of the line disciplines. * * Optional. Always invoke via tty_unthrottle(). Called under the * @tty->termios_rwsem. * * @stop: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should stop outputting * characters to the tty device. * * Called with @tty->flow.lock held. Serialized with @start() method. * * Optional. Always invoke via stop_tty(). * * @start: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it resumed sending * characters to the @tty device. * * Called with @tty->flow.lock held. Serialized with stop() method. * * Optional. Always invoke via start_tty(). * * @hangup: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should hang up the @tty * device. * * Optional. Called with tty lock held. * * @break_ctl: ``int ()(struct tty_struct *tty, int state)`` * * This optional routine requests the @tty driver to turn on or off BREAK * status on the RS-232 port. If @state is -1, then the BREAK status * should be turned on; if @state is 0, then BREAK should be turned off. * * If this routine is implemented, the high-level tty driver will handle * the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK. * * If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(), * then the interface will also be called with actual times and the * hardware is expected to do the delay work itself. 0 and -1 are still * used for on/off. * * Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep. * * @flush_buffer: ``void ()(struct tty_struct *tty)`` * * This routine discards device private output buffer. Invoked on close, * hangup, to implement %TCOFLUSH ioctl and similar. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call * tty_driver_flush_buffer(). * * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)`` * * This routine waits until the device has written out all of the * characters in its transmitter FIFO. Or until @timeout (in jiffies) is * reached. * * Optional: If not provided, the device is assumed to have no FIFO. * Usually correct to invoke via tty_wait_until_sent(). May sleep. * * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)`` * * This routine is used to send a high-priority XON/XOFF character (@ch) * to the @tty device. * * Optional: If not provided, then the @write method is called under * the @tty->atomic_write_lock to keep it serialized with the ldisc. * * @tiocmget: ``int ()(struct tty_struct *tty)`` * * This routine is used to obtain the modem status bits from the @tty * driver. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET * ioctl. Do not call this function directly, call tty_tiocmget(). * * @tiocmset: ``int ()(struct tty_struct *tty, * unsigned int set, unsigned int clear)`` * * This routine is used to set the modem status bits to the @tty driver. * First, @clear bits should be cleared, then @set bits set. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET * ioctl. Do not call this function directly, call tty_tiocmset(). * * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)`` * * Called when a termios request is issued which changes the requested * terminal geometry to @ws. * * Optional: the default action is to update the termios structure * without error. This is usually the correct behaviour. Drivers should * not force errors here if they are not resizable objects (e.g. a serial * line). See tty_do_resize() if you need to wrap the standard method * in your own logic -- the usual case. * * @get_icount: ``int ()(struct tty_struct *tty, * struct serial_icounter *icount)`` * * Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a * kernel structure @icount to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocgserial(). * * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to set the values from. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocsserial(). * * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)`` * * Called when the @tty device file descriptor receives a fdinfo request * from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with * information. * * Optional: called only if provided, otherwise nothing is written to @m. * Do not call this function directly, call tty_show_fdinfo(). * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * * kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. * * Optional: called only if provided, otherwise skipped as a non-polling * driver. * * @poll_get_char: ``int ()(struct tty_driver *driver, int line)`` * * kgdboc support (see @poll_init). @driver should read a character from a * tty identified by @line and return it. * * Optional: called only if @poll_init provided. * * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)`` * * kgdboc support (see @poll_init). @driver should write character @ch to * a tty identified by @line. * * Optional: called only if @poll_init provided. * * @proc_show: ``int ()(struct seq_file *m, void *driver)`` * * Driver @driver (cast to &struct tty_driver) can show additional info in * /proc/tty/driver/<driver_name>. It is enough to fill in the information * into @m. * * Optional: called only if provided, otherwise no /proc entry created. * * This structure defines the interface between the low-level tty driver and * the tty routines. These routines can be defined. Unless noted otherwise, * they are optional, and can be filled in with a %NULL pointer. */ struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); void (*close)(struct tty_struct * tty, struct file * filp); void (*shutdown)(struct tty_struct *tty); void (*cleanup)(struct tty_struct *tty); ssize_t (*write)(struct tty_struct *tty, const u8 *buf, size_t count); int (*put_char)(struct tty_struct *tty, u8 ch); void (*flush_chars)(struct tty_struct *tty); unsigned int (*write_room)(struct tty_struct *tty); unsigned int (*chars_in_buffer)(struct tty_struct *tty); int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); void (*start)(struct tty_struct *tty); void (*hangup)(struct tty_struct *tty); int (*break_ctl)(struct tty_struct *tty, int state); void (*flush_buffer)(struct tty_struct *tty); int (*ldisc_ok)(struct tty_struct *tty, int ldisc); void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, u8 ch); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); int (*get_serial)(struct tty_struct *tty, struct serial_struct *p); int (*set_serial)(struct tty_struct *tty, struct serial_struct *p); void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif int (*proc_show)(struct seq_file *m, void *driver); } __randomize_layout; /** * struct tty_driver -- driver for TTY devices * * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices * @owner: modules owning this driver. Used drivers cannot be rmmod'ed. * Automatically set by tty_alloc_driver(). * @driver_name: name of the driver used in /proc/tty * @name: used for constructing /dev node name * @name_base: used as a number base for constructing /dev node name * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated * @type: type of tty driver (%TTY_DRIVER_TYPE_) * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar * @termios: storage for termios at each TTY close for the next open * @driver_state: pointer to driver's arbitrary data * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct * tty_port helpers in them as much as possible. * @tty_drivers: used internally to link tty_drivers together * * The usual handling of &struct tty_driver is to allocate it by * tty_alloc_driver(), set up all the necessary members, and register it by * tty_register_driver(). At last, the driver is torn down by calling * tty_unregister_driver() followed by tty_driver_kref_put(). * * The fields required to be set before calling tty_register_driver() include * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { struct kref kref; struct cdev **cdevs; struct module *owner; const char *driver_name; const char *name; int name_base; int major; int minor_start; unsigned int num; short type; short subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; /* * Pointer to the tty data structures */ struct tty_struct **ttys; struct tty_port **ports; struct ktermios **termios; void *driver_state; /* * Driver methods */ const struct tty_operations *ops; struct list_head tty_drivers; } __randomize_layout; extern struct list_head tty_drivers; struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags); struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); /* Use TTY_DRIVER_* flags below */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) { kref_get(&d->kref); return d; } static inline void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op) { driver->ops = op; } /** * DOC: TTY Driver Flags * * TTY_DRIVER_RESET_TERMIOS * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * * TTY_DRIVER_REAL_RAW * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` * * That is, if there is no reason for the driver to * send notifications of parity and break characters up to the line * driver, it won't do so. This allows the line driver to optimize for * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) * * TTY_DRIVER_DYNAMIC_DEV * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will * be show up properly in sysfs. If not set, all &tty_driver.num entries * will be created by the tty core in sysfs when tty_register_driver() is * called. This is to be used by drivers that have tty devices that can * appear and disappear while the main tty driver is registered with the * tty core. * * TTY_DRIVER_DEVPTS_MEM * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * * TTY_DRIVER_HARDWARE_BREAK * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * * TTY_DRIVER_DYNAMIC_ALLOC * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * * TTY_DRIVER_UNNUMBERED_NODE * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ #define TTY_DRIVER_INSTALLED 0x0001 #define TTY_DRIVER_RESET_TERMIOS 0x0002 #define TTY_DRIVER_REAL_RAW 0x0004 #define TTY_DRIVER_DYNAMIC_DEV 0x0008 #define TTY_DRIVER_DEVPTS_MEM 0x0010 #define TTY_DRIVER_HARDWARE_BREAK 0x0020 #define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 #define TTY_DRIVER_UNNUMBERED_NODE 0x0080 /* tty driver types */ #define TTY_DRIVER_TYPE_SYSTEM 0x0001 #define TTY_DRIVER_TYPE_CONSOLE 0x0002 #define TTY_DRIVER_TYPE_SERIAL 0x0003 #define TTY_DRIVER_TYPE_PTY 0x0004 #define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ #define TTY_DRIVER_TYPE_SYSCONS 0x0006 /* system subtypes (magic, used by tty_io.c) */ #define SYSTEM_TYPE_TTY 0x0001 #define SYSTEM_TYPE_CONSOLE 0x0002 #define SYSTEM_TYPE_SYSCONS 0x0003 #define SYSTEM_TYPE_SYSPTMX 0x0004 /* pty subtypes (magic, used by tty_io.c) */ #define PTY_TYPE_MASTER 0x0001 #define PTY_TYPE_SLAVE 0x0002 /* serial subtype definitions */ #define SERIAL_TYPE_NORMAL 1 int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); void tty_unregister_device(struct tty_driver *driver, unsigned index); #ifdef CONFIG_PROC_FS void proc_tty_register_driver(struct tty_driver *); void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #endif /* #ifdef _LINUX_TTY_DRIVER_H */ |
| 4 4 3 1 10 10 1 9 9 1 1 6 1 7 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2012-2013 Red Hat, Inc. * All rights reserved. */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_quota.h" #include "xfs_symlink.h" #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" #include "xfs_health.h" #include "xfs_symlink_remote.h" #include "xfs_parent.h" #include "xfs_defer.h" int xfs_readlink( struct xfs_inode *ip, char *link) { struct xfs_mount *mp = ip->i_mount; xfs_fsize_t pathlen; int error; trace_xfs_readlink(ip); if (xfs_is_shutdown(mp)) return -EIO; if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); pathlen = ip->i_disk_size; if (!pathlen) goto out_corrupt; if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); goto out_corrupt; } if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { /* * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. */ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) goto out_corrupt; memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { error = xfs_symlink_remote_read(ip, link); } xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; out_corrupt: xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); return -EFSCORRUPTED; } int xfs_symlink( struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_icreate_args args = { .idmap = idmap, .pip = dp, .mode = S_IFLNK | (mode & ~S_IFMT), }; struct xfs_dir_update du = { .dp = dp, .name = link_name, }; struct xfs_trans *tp = NULL; int error = 0; int pathlen; bool unlock_dp_on_error = false; xfs_filblks_t fs_blocks; struct xfs_dquot *udqp; struct xfs_dquot *gdqp; struct xfs_dquot *pdqp; uint resblks; xfs_ino_t ino; *ipp = NULL; trace_xfs_symlink(dp, link_name); if (xfs_is_shutdown(mp)) return -EIO; /* * Check component lengths of the target path name. */ pathlen = strlen(target_path); if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */ return -ENAMETOOLONG; ASSERT(pathlen > 0); /* Make sure that we have allocated dquot(s) on disk. */ error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); if (error) return error; /* * The symlink will fit into the inode data fork? * If there are no parent pointers, then there wont't be any attributes. * So we get the whole variable part, and do not need to reserve extra * blocks. Otherwise, we need to reserve the blocks. */ if (pathlen <= XFS_LITINO(mp) && !xfs_has_parent(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks); error = xfs_parent_start(mp, &du.ppargs); if (error) goto out_release_dquots; error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, pdqp, resblks, &tp); if (error) goto out_parent; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* * Check whether the directory allows new symlinks or not. */ if (dp->i_diflags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; goto out_trans_cancel; } /* * Allocate an inode for the symlink. */ error = xfs_dialloc(&tp, &args, &ino); if (!error) error = xfs_icreate(tp, ino, &args, &du.ip); if (error) goto out_trans_cancel; /* * Now we join the directory inode to the transaction. We do not do it * earlier because xfs_dir_ialloc might commit the previous transaction * (and release all the locks). An error from here on will result in * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ xfs_trans_ijoin(tp, dp, 0); /* * Also attach the dquot(s) to it, if applicable. */ xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); error = xfs_symlink_write_target(tp, du.ip, du.ip->i_ino, target_path, pathlen, fs_blocks, resblks); if (error) goto out_trans_cancel; resblks -= fs_blocks; i_size_write(VFS_I(du.ip), du.ip->i_disk_size); /* * Create the directory entry for the symlink. */ error = xfs_dir_create_child(tp, resblks, &du); if (error) goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the * symlink transaction goes to disk before returning to * the user. */ if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) goto out_release_inode; xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); *ipp = du.ip; xfs_iunlock(du.ip, XFS_ILOCK_EXCL); xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_parent_finish(mp, du.ppargs); return 0; out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the * setup of the inode and release the inode. This prevents recursive * transactions and deadlocks from xfs_inactive. */ if (du.ip) { xfs_iunlock(du.ip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(du.ip); xfs_irele(du.ip); } out_parent: xfs_parent_finish(mp, du.ppargs); out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } /* * Free a symlink that has blocks associated with it. * * Note: zero length symlinks are not allowed to exist. When we set the size to * zero, also change it to a regular file so that it does not get written to * disk as a zero length symlink. The inode is on the unlinked list already, so * userspace cannot find this inode anymore, so this change is not user visible * but allows us to catch corrupt zero-length symlinks in the verifiers. */ STATIC int xfs_inactive_symlink_rmt( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; ASSERT(!xfs_need_iread_extents(&ip->i_df)); /* * We're freeing a symlink that has some * blocks allocated to it. Free the * blocks here. We know that we've got * either 1 or 2 extents and that we can * free them all in one bunmapi call. */ ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); /* * Lock the inode, fix the size, turn it into a regular file and join it * to the transaction. Hold it so in the normal path, we still have it * locked for the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. */ ip->i_disk_size = 0; VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_symlink_remote_truncate(tp, ip); if (error) goto error_trans_cancel; error = xfs_trans_commit(tp); if (error) { ASSERT(xfs_is_shutdown(mp)); goto error_unlock; } /* * Remove the memory for extent descriptions (just bookkeeping). */ if (ip->i_df.if_bytes) xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); ASSERT(ip->i_df.if_bytes == 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; error_trans_cancel: xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * xfs_inactive_symlink - free a symlink */ int xfs_inactive_symlink( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; int pathlen; trace_xfs_inactive_symlink(ip); if (xfs_is_shutdown(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); pathlen = (int)ip->i_disk_size; ASSERT(pathlen); if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(0); xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); return -EFSCORRUPTED; } /* * Inline fork state gets removed by xfs_difree() so we have nothing to * do here in that case. */ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { xfs_iunlock(ip, XFS_ILOCK_EXCL); return 0; } xfs_iunlock(ip, XFS_ILOCK_EXCL); /* remove the remote symlink */ return xfs_inactive_symlink_rmt(ip); } |
| 8666 3147 8435 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fault-inject.h> #include <linux/debugfs.h> #include <linux/error-injection.h> #include <linux/mm.h> static struct { struct fault_attr attr; bool ignore_gfp_highmem; bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) { return setup_fault_attr(&fail_page_alloc.attr, str); } __setup("fail_page_alloc=", setup_fail_page_alloc); bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; if (fail_page_alloc.ignore_gfp_reclaim && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) flags |= FAULT_NOWARN; return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_page_alloc_debugfs(void) { umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_reclaim); debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; } late_initcall(fail_page_alloc_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
| 3570 2132 686 2908 16 2906 2909 9 3897 1485 3567 3577 3570 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 | // SPDX-License-Identifier: GPL-2.0-only /* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> #include <linux/security.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, const struct in6_addr *addr, __be16 port, char *name1, char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, char *name1, char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * dump_common_audit_data - helper to dump common audit data * @ab : the audit buffer * @a : common audit data * */ static void dump_common_audit_data(struct audit_buffer *ab, struct common_audit_data *a) { char comm[sizeof(current->comm)]; /* * To keep stack sizes in check force programmers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " ipc_key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; audit_log_format(ab, " name="); spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; rcu_read_lock(); inode = a->u.inode; dentry = d_find_alias_rcu(inode); if (dentry) { audit_log_format(ab, " name="); spin_lock(&dentry->d_lock); audit_log_untrustedstring(ab, dentry->d_name.name); spin_unlock(&dentry->d_lock); } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); rcu_read_unlock(); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); } } break; } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { const struct sock *sk = a->u.net->sk; const struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; switch (sk->sk_family) { case AF_INET: { const struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { const struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; case LSM_AUDIT_DATA_LOCKDOWN: audit_log_format(ab, " lockdown_reason=\"%s\"", lockdown_reasons[a->u.reason]); break; case LSM_AUDIT_DATA_ANONINODE: audit_log_format(ab, " anonclass=%s", a->u.anonclass); break; } /* switch (a->type) */ } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ ab = audit_log_start(audit_context(), GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; if (pre_audit) pre_audit(ab, a); dump_common_audit_data(ab, a); if (post_audit) post_audit(ab, a); audit_log_end(ab); } |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 | // SPDX-License-Identifier: GPL-2.0 /* * Central processing for nfsd. * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched/signal.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> #include <linux/siphash.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/nfslocalio.h> #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" #include "vfs.h" #include "netns.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_SVC atomic_t nfsd_th_cnt = ATOMIC_INIT(0); static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static int nfsd_acl_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_acl_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); #endif static int nfsd_rpcbind_set(struct net *, const struct svc_program *, u32, int, unsigned short, unsigned short); static __be32 nfsd_init_request(struct svc_rqst *, const struct svc_program *, struct svc_process_info *); /* * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks. * * Finally, the nfsd_mutex also protects some of the global variables that are * accessed when nfsd starts and that are settable via the write_* routines in * nfsctl.c. In particular: * * user_recovery_dirname * user_lease_time * nfsd_versions */ DEFINE_MUTEX(nfsd_mutex); /* * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. * nfsd_drc_max_pages limits the total amount of memory available for * version 4.1 DRC caches. * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ DEFINE_SPINLOCK(nfsd_drc_lock); unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if IS_ENABLED(CONFIG_NFS_LOCALIO) static const struct svc_version *localio_versions[] = { [1] = &localio_version1, }; #define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions) #endif /* CONFIG_NFS_LOCALIO */ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, # endif # if defined(CONFIG_NFSD_V3_ACL) [3] = &nfsd_acl_version3, # endif }; #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = { #if defined(CONFIG_NFSD_V2) [2] = &nfsd_version2, #endif [3] = &nfsd_version3, #if defined(CONFIG_NFSD_V4) [4] = &nfsd_version4, #endif }; struct svc_program nfsd_programs[] = { { .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ .pg_authenticate = svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, }, #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }, #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ #if IS_ENABLED(CONFIG_NFS_LOCALIO) { .pg_prog = NFS_LOCALIO_PROGRAM, .pg_nvers = NFSD_LOCALIO_NRVERS, .pg_vers = localio_versions, .pg_name = "nfslocalio", .pg_class = "nfsd", .pg_authenticate = svc_set_client, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, } #endif /* CONFIG_NFS_LOCALIO */ }; bool nfsd_support_version(int vers) { if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS) return nfsd_version[vers] != NULL; return false; } int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS) return 0; switch(change) { case NFSD_SET: nn->nfsd_versions[vers] = nfsd_support_version(vers); break; case NFSD_CLEAR: nn->nfsd_versions[vers] = false; break; case NFSD_TEST: return nn->nfsd_versions[vers]; case NFSD_AVAIL: return nfsd_support_version(vers); } return 0; } static void nfsd_adjust_nfsd_versions4(struct nfsd_net *nn) { unsigned i; for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) { if (nn->nfsd4_minorversions[i]) return; } nfsd_vers(nn, 4, NFSD_CLEAR); } int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change) { if (minorversion > NFSD_SUPPORTED_MINOR_VERSION && change != NFSD_AVAIL) return -1; switch(change) { case NFSD_SET: nfsd_vers(nn, 4, NFSD_SET); nn->nfsd4_minorversions[minorversion] = nfsd_vers(nn, 4, NFSD_TEST); break; case NFSD_CLEAR: nn->nfsd4_minorversions[minorversion] = false; nfsd_adjust_nfsd_versions4(nn); break; case NFSD_TEST: return nn->nfsd4_minorversions[minorversion]; case NFSD_AVAIL: return minorversion <= NFSD_SUPPORTED_MINOR_VERSION && nfsd_vers(nn, 4, NFSD_AVAIL); } return 0; } bool nfsd_serv_try_get(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref)); } void nfsd_serv_put(struct net *net) __must_hold(rcu) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); percpu_ref_put(&nn->nfsd_serv_ref); } static void nfsd_serv_done(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); complete(&nn->nfsd_serv_confirm_done); } static void nfsd_serv_free(struct percpu_ref *ref) { struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref); complete(&nn->nfsd_serv_free_done); } /* * Maximum number of nfsd processes */ #define NFSD_MAXSERVS 8192 int nfsd_nrthreads(struct net *net) { int rv = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); mutex_lock(&nfsd_mutex); if (nn->nfsd_serv) rv = nn->nfsd_serv->sv_nrthreads; mutex_unlock(&nfsd_mutex); return rv; } static int nfsd_init_socks(struct net *net, const struct cred *cred) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!list_empty(&nn->nfsd_serv->sv_permsocks)) return 0; error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT, SVC_SOCK_DEFAULTS, cred); if (error < 0) return error; return 0; } static int nfsd_users = 0; static int nfsd_startup_generic(void) { int ret; if (nfsd_users++) return 0; ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) goto out_file_cache; return 0; out_file_cache: nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; } static void nfsd_shutdown_generic(void) { if (--nfsd_users) return; nfs4_state_shutdown(); nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) { return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } /** * nfsd_copy_write_verifier - Atomically copy a write verifier * @verf: buffer in which to receive the verifier cookie * @nn: NFS net namespace * * This function provides a wait-free mechanism for copying the * namespace's write verifier without tearing it. */ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) { unsigned int seq; do { seq = read_seqbegin(&nn->writeverf_lock); memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (read_seqretry(&nn->writeverf_lock, seq)); } static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn) { struct timespec64 now; u64 verf; /* * Because the time value is hashed, y2038 time_t overflow * is irrelevant in this usage. */ ktime_get_raw_ts64(&now); verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key); memcpy(nn->writeverf, &verf, sizeof(nn->writeverf)); } /** * nfsd_reset_write_verifier - Generate a new write verifier * @nn: NFS net namespace * * This function updates the ->writeverf field of @nn. This field * contains an opaque cookie that, according to Section 18.32.3 of * RFC 8881, "the client can use to determine whether a server has * changed instance state (e.g., server restart) between a call to * WRITE and a subsequent call to either WRITE or COMMIT. This * cookie MUST be unchanged during a single instance of the NFSv4.1 * server and MUST be unique between instances of the NFSv4.1 * server." */ void nfsd_reset_write_verifier(struct nfsd_net *nn) { write_seqlock(&nn->writeverf_lock); nfsd_reset_write_verifier_locked(nn); write_sequnlock(&nn->writeverf_lock); } /* * Crank up a set of per-namespace resources for a new NFSD instance, * including lockd, a duplicate reply cache, an open file cache * instance, and a cache of NFSv4 state objects. */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; if (nn->nfsd_net_up) return 0; ret = nfsd_startup_generic(); if (ret) return ret; ret = nfsd_init_socks(net, cred); if (ret) goto out_socks; if (nfsd_needs_lockd(nn) && !nn->lockd_up) { ret = lockd_up(net, cred); if (ret) goto out_socks; nn->lockd_up = true; } ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif nn->nfsd_net_up = true; return 0; out_reply_cache: nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } out_socks: nfsd_shutdown_generic(); return ret; } static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->nfsd_net_up) return; nfsd_export_flush(net); nfs4_state_shutdown_net(net); nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); nn->lockd_up = false; } percpu_ref_exit(&nn->nfsd_serv_ref); nn->nfsd_net_up = false; nfsd_shutdown_generic(); } static DEFINE_SPINLOCK(nfsd_notifier_lock); static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inetaddr_notifier = { .notifier_call = nfsd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nfsd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct net_device *dev = ifa->idev->dev; struct net *net = dev_net(dev); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; if (event != NETDEV_DOWN || !nn->nfsd_serv) goto out; spin_lock(&nfsd_notifier_lock); if (nn->nfsd_serv) { dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } spin_unlock(&nfsd_notifier_lock); out: return NOTIFY_DONE; } static struct notifier_block nfsd_inet6addr_notifier = { .notifier_call = nfsd_inet6addr_event, }; #endif /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); /** * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace * @net: network namespace the NFS service is associated with */ void nfsd_destroy_serv(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; lockdep_assert_held(&nfsd_mutex); percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done); wait_for_completion(&nn->nfsd_serv_confirm_done); wait_for_completion(&nn->nfsd_serv_free_done); /* percpu_ref_exit is called in nfsd_shutdown_net */ spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = NULL; spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } svc_xprt_destroy_all(serv, net); /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ svc_rpcb_cleanup(serv, net); nfsd_shutdown_net(net); svc_destroy(&serv); } void nfsd_reset_versions(struct nfsd_net *nn) { int i; for (i = 0; i <= NFSD_MAXVERS; i++) if (nfsd_vers(nn, i, NFSD_TEST)) return; for (i = 0; i <= NFSD_MAXVERS; i++) if (i != 4) nfsd_vers(nn, i, NFSD_SET); else { int minor = 0; while (nfsd_minorversion(nn, minor, NFSD_SET) >= 0) minor++; } } /* * Each session guarantees a negotiated per slot memory cache for replies * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated * NFSv4.1 server might want to use more memory for a DRC than a machine * with mutiple services. * * Impose a hard limit on the number of pages for the DRC which varies * according to the machines free pages. This is of course only a default. * * For now this is a #defined shift which could be under admin control * in the future. */ static void set_max_drc(void) { #define NFSD_DRC_SIZE_SHIFT 7 nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); } static int nfsd_get_default_max_blksize(void) { struct sysinfo i; unsigned long long target; unsigned long ret; si_meminfo(&i); target = (i.totalram - i.totalhigh) << PAGE_SHIFT; /* * Aim for 1/4096 of memory per thread This gives 1MB on 4Gig * machines, but only uses 32K on 128M machines. Bottom out at * 8K on 32M and smaller. Of course, this is only a default. */ target >>= 12; ret = NFSSVC_MAXBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; } void nfsd_shutdown_threads(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; mutex_lock(&nfsd_mutex); serv = nn->nfsd_serv; if (serv == NULL) { mutex_unlock(&nfsd_mutex); return; } /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); nfsd_destroy_serv(net); mutex_unlock(&nfsd_mutex); } struct svc_rqst *nfsd_current_rqst(void) { if (kthread_func(current) == nfsd) return kthread_data(current); return NULL; } int nfsd_create_serv(struct net *net) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; WARN_ON(!mutex_is_locked(&nfsd_mutex)); if (nn->nfsd_serv) return 0; error = percpu_ref_init(&nn->nfsd_serv_ref, nfsd_serv_free, 0, GFP_KERNEL); if (error) return error; init_completion(&nn->nfsd_serv_free_done); init_completion(&nn->nfsd_serv_confirm_done); if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs), &nn->nfsd_svcstats, nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { svc_destroy(&serv); return error; } spin_lock(&nfsd_notifier_lock); nn->nfsd_serv = serv; spin_unlock(&nfsd_notifier_lock); set_max_drc(); /* check if the notifier is already set */ if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } nfsd_reset_write_verifier(nn); return 0; } int nfsd_nrpools(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->nfsd_serv == NULL) return 0; else return nn->nfsd_serv->sv_nrpools; } int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv = nn->nfsd_serv; int i; if (serv) for (i = 0; i < serv->sv_nrpools && i < n; i++) nthreads[i] = serv->sv_pools[i].sp_nrthreads; return 0; } /** * nfsd_set_nrthreads - set the number of running threads in the net's service * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * * This function alters the number of running threads for the given network * namespace in each pool. If passed an array longer then the number of pools * the extra pool settings are ignored. If passed an array shorter than the * number of pools, the missing values are interpreted as 0's. * * Returns 0 on success or a negative errno on error. */ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; int tot = 0; int err = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); lockdep_assert_held(&nfsd_mutex); if (nn->nfsd_serv == NULL || n <= 0) return 0; /* * Special case: When n == 1, pass in NULL for the pool, so that the * change is distributed equally among them. */ if (n == 1) return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]); if (n > nn->nfsd_serv->sv_nrpools) n = nn->nfsd_serv->sv_nrpools; /* enforce a global maximum number of threads */ tot = 0; for (i = 0; i < n; i++) { nthreads[i] = min(nthreads[i], NFSD_MAXSERVS); tot += nthreads[i]; } if (tot > NFSD_MAXSERVS) { /* total too large: scale down requested numbers */ for (i = 0; i < n && tot > 0; i++) { int new = nthreads[i] * NFSD_MAXSERVS / tot; tot -= (nthreads[i] - new); nthreads[i] = new; } for (i = 0; i < n && tot > 0; i++) { nthreads[i]--; tot--; } } /* apply the new numbers */ for (i = 0; i < n; i++) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], nthreads[i]); if (err) goto out; } /* Anything undefined in array is considered to be 0 */ for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) { err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i], 0); if (err) goto out; } out: return err; } /** * nfsd_svc: start up or shut down the nfsd server * @n: number of array members in @nthreads * @nthreads: array of thread counts for each pool * @net: network namespace to operate within * @cred: credentials to use for xprt creation * @scope: server scope value (defaults to nodename) * * Adjust the number of threads in each pool and return the new * total number of threads in the service. */ int nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope) { int error; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; lockdep_assert_held(&nfsd_mutex); dprintk("nfsd: creating service\n"); strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename, sizeof(nn->nfsd_name)); error = nfsd_create_serv(net); if (error) goto out; serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; error = nfsd_set_nrthreads(n, nthreads, net); if (error) goto out_put; error = serv->sv_nrthreads; out_put: if (serv->sv_nrthreads == 0) nfsd_destroy_serv(net); out: return error; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static bool nfsd_support_acl_version(int vers) { if (vers >= NFSD_ACL_MINVERS && vers < NFSD_ACL_NRVERS) return nfsd_acl_version[vers] != NULL; return false; } static int nfsd_acl_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_support_acl_version(version) || !nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_acl_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_ACL_NRVERS; for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers == NFSD_ACL_NRVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_ACL_MINVERS; for (i = NFSD_ACL_NRVERS - 1; i >= NFSD_ACL_MINVERS; i--) { if (nfsd_support_acl_version(rqstp->rq_vers) && nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } #endif static int nfsd_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { if (!nfsd_vers(net_generic(net, nfsd_net_id), version, NFSD_TEST)) return 0; return svc_generic_rpcbind_set(net, progp, version, family, proto, port); } static __be32 nfsd_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); int i; if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST))) return svc_generic_init_request(rqstp, progp, ret); ret->mismatch.lovers = NFSD_MAXVERS + 1; for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.lovers = i; break; } } if (ret->mismatch.lovers > NFSD_MAXVERS) return rpc_prog_unavail; ret->mismatch.hivers = NFSD_MINVERS; for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) { if (nfsd_vers(nn, i, NFSD_TEST)) { ret->mismatch.hivers = i; break; } } return rpc_prog_mismatch; } /* * This is the NFS server kernel thread */ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); /* At this point, the thread shares current->fs * with the init process. We need to create files with the * umask as defined by the client instead of init's umask. */ svc_thread_init_status(rqstp, unshare_fs_struct()); current->fs->umask = 0; atomic_inc(&nfsd_th_cnt); set_freezable(); /* * The main request loop */ while (!svc_thread_should_stop(rqstp)) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; svc_recv(rqstp); nfsd_file_net_dispose(nn); } atomic_dec(&nfsd_th_cnt); /* Release the thread */ svc_exit_thread(rqstp); return 0; } /** * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request * @rqstp: incoming request * * This RPC dispatcher integrates the NFS server's duplicate reply cache. * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; unsigned int start, len; __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; /* * ->pc_decode advances the argument stream past the NFS * Call header, so grab the header's starting location and * size now for the call to nfsd_cache_lookup(). */ start = xdr_stream_pos(&rqstp->rq_arg_stream); len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; /* * Release rq_status_counter setting it to an odd value after the rpc * request has been properly parsed. rq_status_counter is used to * notify the consumers if the rqstp fields are stable * (rq_status_counter is odd) or not meaningful (rq_status_counter * is even). */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: goto out_cached_reply; case RC_DROPIT: goto out_dropit; } nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; /* * Release rq_status_counter setting it to an even value after the rpc * request has been properly processed. */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; out_decode_err: trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } /** * nfssvc_decode_voidarg - Decode void arguments * @rqstp: Server RPC transaction context * @xdr: XDR stream positioned at arguments to decode * * Return values: * %false: Arguments were not valid * %true: Decoding was successful */ bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; } /** * nfssvc_encode_voidres - Encode void results * @rqstp: Server RPC transaction context * @xdr: XDR stream into which to encode results * * Return values: * %false: Local error while encoding * %true: Encoding was successful */ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr) { return true; } |
| 12 130 129 129 108 130 66 103 46 46 46 2 45 1 4 2 2 46 13 13 6 12 8 5 11 108 108 5 103 108 108 102 108 108 1 1 55 103 103 13 13 13 6 13 13 13 13 11 11 12 12 12 12 6 131 131 131 131 131 131 129 131 131 131 105 105 2 2 2 2 2 2 2 2 6 6 6 49 49 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bcachefs_ioctl.h" #include "btree_cache.h" #include "btree_journal_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" #include "compress.h" #include "disk_accounting.h" #include "error.h" #include "journal_io.h" #include "replicas.h" /* * Notes on disk accounting: * * We have two parallel sets of counters to be concerned with, and both must be * kept in sync. * * - Persistent/on disk accounting, stored in the accounting btree and updated * via btree write buffer updates that treat new accounting keys as deltas to * apply to existing values. But reading from a write buffer btree is * expensive, so we also have * * - In memory accounting, where accounting is stored as an array of percpu * counters, indexed by an eytzinger array of disk acounting keys/bpos (which * are the same thing, excepting byte swabbing on big endian). * * Cheap to read, but non persistent. * * Disk accounting updates are generated by transactional triggers; these run as * keys enter and leave the btree, and can compare old and new versions of keys; * the output of these triggers are deltas to the various counters. * * Disk accounting updates are done as btree write buffer updates, where the * counters in the disk accounting key are deltas that will be applied to the * counter in the btree when the key is flushed by the write buffer (or journal * replay). * * To do a disk accounting update: * - initialize a disk_accounting_pos, to specify which counter is being update * - initialize counter deltas, as an array of 1-3 s64s * - call bch2_disk_accounting_mod() * * This queues up the accounting update to be done at transaction commit time. * Underneath, it's a normal btree write buffer update. * * The transaction commit path is responsible for propagating updates to the in * memory counters, with bch2_accounting_mem_mod(). * * The commit path also assigns every disk accounting update a unique version * number, based on the journal sequence number and offset within that journal * buffer; this is used by journal replay to determine which updates have been * done. * * The transaction commit path also ensures that replicas entry accounting * updates are properly marked in the superblock (so that we know whether we can * mount without data being unavailable); it will update the superblock if * bch2_accounting_mem_mod() tells it to. */ static const char * const disk_accounting_type_strs[] = { #define x(t, n, ...) [n] = #t, BCH_DISK_ACCOUNTING_TYPES() #undef x NULL }; static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); acc->k.p = disk_accounting_pos_to_bpos(pos); set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp); break; } BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, k, d, nr); return likely(!gc) ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, unsigned dev, s64 sectors, bool gc) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_replicas, }; bch2_replicas_entry_cached(&acc.replicas, dev); return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); } static inline bool is_zero(char *start, char *end) { BUG_ON(start > end); for (; start < end; start++) if (*start) return false; return true; } #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; bkey_fsck_err_on(bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); switch (acc_k.type) { case BCH_DISK_ACCOUNTING_nr_inodes: end = field_end(acc_k, nr_inodes); break; case BCH_DISK_ACCOUNTING_persistent_reserved: end = field_end(acc_k, persistent_reserved); break; case BCH_DISK_ACCOUNTING_replicas: bkey_fsck_err_on(!acc_k.replicas.nr_devs, c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0"); bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || (acc_k.replicas.nr_required > 1 && acc_k.replicas.nr_required == acc_k.replicas.nr_devs), c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); break; case BCH_DISK_ACCOUNTING_dev_data_type: end = field_end(acc_k, dev_data_type); break; case BCH_DISK_ACCOUNTING_compression: end = field_end(acc_k, compression); break; case BCH_DISK_ACCOUNTING_snapshot: end = field_end(acc_k, snapshot); break; case BCH_DISK_ACCOUNTING_btree: end = field_end(acc_k, btree); break; case BCH_DISK_ACCOUNTING_rebalance_work: end = field_end(acc_k, rebalance_work); break; } bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, accounting_key_junk_at_end, "junk at end of accounting key"); fsck_err: return ret; } void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) { if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) { prt_printf(out, "unknown type %u", k->type); return; } prt_str(out, disk_accounting_type_strs[k->type]); prt_str(out, " "); switch (k->type) { case BCH_DISK_ACCOUNTING_nr_inodes: break; case BCH_DISK_ACCOUNTING_persistent_reserved: prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas); break; case BCH_DISK_ACCOUNTING_replicas: bch2_replicas_entry_to_text(out, &k->replicas); break; case BCH_DISK_ACCOUNTING_dev_data_type: prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev); bch2_prt_data_type(out, k->dev_data_type.data_type); break; case BCH_DISK_ACCOUNTING_compression: bch2_prt_compression_type(out, k->compression.type); break; case BCH_DISK_ACCOUNTING_snapshot: prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); break; } } void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); bch2_accounting_key_to_text(out, &acc_k); for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++) prt_printf(out, " %lli", acc.v->d[i]); } void bch2_accounting_swab(struct bkey_s k) { for (u64 *p = (u64 *) k.v; p < (u64 *) bkey_val_end(k); p++) *p = swab64(*p); } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct disk_accounting_pos acc) { unsafe_memcpy(r, &acc.replicas, replicas_entry_bytes(&acc.replicas), "variable length struct"); } static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, p); switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: __accounting_to_replicas(r, acc_k); return true; default: return false; } } static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { struct bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? bch2_mark_replicas(c, &r.e) : 0; } /* * Ensure accounting keys being updated are present in the superblock, when * applicable (i.e. replicas updates) */ int bch2_accounting_update_sb(struct btree_trans *trans) { for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); if (ret) return ret; } return 0; } static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a) { struct bch_accounting_mem *acc = &c->accounting; /* raced with another insert, already present: */ if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p) < acc->k.nr) return 0; struct accounting_mem_entry n = { .pos = a.k->p, .bversion = a.k->bversion, .nr_counters = bch2_accounting_counters(a.k), .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL), }; if (!n.v[0]) goto err; if (acc->gc_running) { n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL); if (!n.v[1]) goto err; } if (darray_push(&acc->k, n)) goto err; eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); return 0; err: free_percpu(n.v[1]); free_percpu(n.v[0]); return -BCH_ERR_ENOMEM_disk_accounting; } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { struct bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) return -BCH_ERR_btree_insert_need_mark_replicas; percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); int ret = __bch2_accounting_mem_insert(c, a); percpu_up_write(&c->mark_lock); percpu_down_read(&c->mark_lock); return ret; } static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) { for (unsigned i = 0; i < e->nr_counters; i++) if (percpu_u64_get(e->v[0] + i) || (e->v[1] && percpu_u64_get(e->v[1] + i))) return false; return true; } void bch2_accounting_mem_gc(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; percpu_down_write(&c->mark_lock); struct accounting_mem_entry *dst = acc->k.data; darray_for_each(acc->k, src) { if (accounting_mem_entry_is_zero(src)) { free_percpu(src->v[0]); free_percpu(src->v[1]); } else { *dst++ = *src; } } acc->k.nr = dst - acc->k.data; eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); percpu_up_write(&c->mark_lock); } /* * Read out accounting keys for replicas entries, as an array of * bch_replicas_usage entries. * * Note: this may be deprecated/removed at smoe point in the future and replaced * with something more general, it exists to support the ioctl used by the * 'bcachefs fs usage' command. */ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(usage); percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { struct { struct bch_replicas_usage r; u8 pad[BCH_BKEY_PTRS_MAX]; } u; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; u64 sectors; bch2_accounting_mem_read_counters(acc, i - acc->k.data, §ors, 1, false); u.r.sectors = sectors; ret = darray_make_room(usage, replicas_usage_bytes(&u.r)); if (ret) break; memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); usage->nr += replicas_usage_bytes(&u.r); } percpu_up_read(&c->mark_lock); if (ret) darray_exit(usage); return ret; } int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; darray_init(out_buf); percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { struct disk_accounting_pos a_p; bpos_to_disk_accounting_pos(&a_p, i->pos); if (!(accounting_types_mask & BIT(a_p.type))) continue; ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) + sizeof(u64) * i->nr_counters); if (ret) break; struct bkey_i_accounting *a_out = bkey_accounting_init((void *) &darray_top(*out_buf)); set_bkey_val_u64s(&a_out->k, i->nr_counters); a_out->k.p = i->pos; bch2_accounting_mem_read_counters(acc, i - acc->k.data, a_out->v.d, i->nr_counters, false); if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out))) out_buf->nr += bkey_bytes(&a_out->k); } percpu_up_read(&c->mark_lock); if (ret) darray_exit(out_buf); return ret; } void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; percpu_down_read(&c->mark_lock); out->atomic++; eytzinger0_for_each(i, acc->k.nr) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); bch2_accounting_key_to_text(out, &acc_k); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); prt_str(out, ":"); for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) prt_printf(out, " %llu", v[j]); prt_newline(out); } --out->atomic; percpu_up_read(&c->mark_lock); } static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { free_percpu(e->v[gc]); e->v[gc] = NULL; } } int bch2_gc_accounting_start(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; int ret = 0; percpu_down_write(&c->mark_lock); darray_for_each(acc->k, e) { e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); ret = -BCH_ERR_ENOMEM_disk_accounting; break; } } acc->gc_running = !ret; percpu_up_write(&c->mark_lock); return ret; } int bch2_gc_accounting_done(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; struct bpos pos = POS_MIN; int ret = 0; percpu_down_write(&c->mark_lock); while (1) { unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &pos); if (idx >= acc->k.nr) break; struct accounting_mem_entry *e = acc->k.data + idx; pos = bpos_successor(e->pos); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; unsigned nr = e->nr_counters; bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false); bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true); if (memcmp(dst_v, src_v, nr * sizeof(u64))) { printbuf_reset(&buf); prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); prt_str(&buf, ": got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); prt_str(&buf, " should be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); for (unsigned j = 0; j < nr; j++) src_v[j] -= dst_v[j]; if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { percpu_up_write(&c->mark_lock); ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); percpu_down_write(&c->mark_lock); if (ret) goto err; if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), BCH_ACCOUNTING_normal); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); struct bch_fs_usage_base *src = &trans->fs_usage_delta; acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); preempt_enable(); } } } } err: fsck_err: percpu_up_write(&c->mark_lock); printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; if (k.k->type != KEY_TYPE_accounting) return 0; percpu_down_read(&c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), BCH_ACCOUNTING_read); percpu_up_read(&c->mark_lock); return ret; } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, struct disk_accounting_pos acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && !bch2_dev_exists(c, r.e.devs[i])) { invalid_dev = r.e.devs[i]; goto invalid_device; } /* * All replicas entry checks except for invalid device are done * in bch2_accounting_validate */ BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf)); if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n %s", (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping * and retaking lock is ok: */ percpu_up_write(&c->mark_lock); ret = bch2_mark_replicas(c, &r.e); if (ret) goto fsck_err; percpu_down_write(&c->mark_lock); } break; } case BCH_DISK_ACCOUNTING_dev_data_type: if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { invalid_dev = acc.dev_data_type.dev; goto invalid_device; } break; } fsck_err: printbuf_exit(&buf); return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, "accounting entry points to invalid device %i\n %s", invalid_dev, (printbuf_reset(&buf), bch2_accounting_key_to_text(&buf, &acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { ret = -BCH_ERR_remove_disk_accounting_entry; } goto fsck_err; } /* * At startup time, initialize the in memory accounting from the btree (and * journal) */ int bch2_accounting_read(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); accounting_read_key(trans, k); })); if (ret) goto err; struct journal_keys *keys = &c->journal_keys; struct journal_key *dst = keys->data; move_gap(keys, keys->nr); darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &k.k->p); bool applied = idx < acc->k.nr && bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0; if (applied) continue; if (i + 1 < &darray_top(*keys) && i[1].k->k.type == KEY_TYPE_accounting && !journal_key_cmp(i, i + 1)) { WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0); i[1].journal_seq = i[0].journal_seq; bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k), bkey_s_c_to_accounting(k)); continue; } ret = accounting_read_key(trans, k); if (ret) goto err; } *dst++ = *i; } keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); unsigned i = 0; while (i < acc->k.nr) { unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); /* * If the entry counters are zeroed, it should be treated as * nonexistent - it might point to an invalid device. * * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) ? -BCH_ERR_remove_disk_accounting_entry : bch2_disk_accounting_validate_late(trans, acc_k, v, acc->k.data[idx].nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(acc->k.data[idx].v[0]); free_percpu(acc->k.data[idx].v[1]); darray_remove_item(&acc->k, &acc->k.data[idx]); eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); ret = 0; continue; } if (ret) goto fsck_err; i++; } preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); for (unsigned i = 0; i < acc->k.nr; i++) { struct disk_accounting_pos k; bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); switch (k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: usage->reserved += v[0] * k.persistent_reserved.nr_replicas; break; case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); percpu_u64_set(&d->sectors, v[1]); percpu_u64_set(&d->fragmented, v[2]); if (k.dev_data_type.data_type == BCH_DATA_sb || k.dev_data_type.data_type == BCH_DATA_journal) usage->hidden += v[0] * ca->mi.bucket_size; } rcu_read_unlock(); break; } } preempt_enable(); fsck_err: percpu_up_write(&c->mark_lock); err: printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; } int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) { return bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ struct disk_accounting_pos acc; bpos_to_disk_accounting_pos(&acc, k.k->p); acc.type == BCH_DISK_ACCOUNTING_dev_data_type && acc.dev_data_type.dev == dev ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) : 0; })) ?: bch2_btree_write_buffer_flush_sync(trans)); } int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, .dev_data_type.data_type = BCH_DATA_free, }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; int ret = bch2_trans_do(c, ({ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0); })); bch_err_fn(c, ret); return ret; } void bch2_verify_accounting_clean(struct bch_fs *c) { bool mismatch = false; struct bch_fs_usage_base base = {}, base_inmem = {}; bch2_trans_run(c, for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_all_snapshots, k, ({ u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; if (acc_k.type == BCH_DISK_ACCOUNTING_inum) continue; bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, " !="); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", v[j]); pr_err("%s", buf.buf); printbuf_exit(&buf); mismatch = true; } switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; break; case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; } v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); rcu_read_unlock(); if (memcmp(a.v->d, v, 3 * sizeof(u64))) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, " in mem"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", v[j]); pr_err("dev accounting mismatch: %s", buf.buf); printbuf_exit(&buf); mismatch = true; } } } 0; }))); acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); #define check(x) \ if (base.x != base_inmem.x) { \ pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \ mismatch = true; \ } //check(hidden); check(btree); check(data); check(cached); check(reserved); check(nr_inodes); WARN_ON(mismatch); } void bch2_accounting_gc_free(struct bch_fs *c) { lockdep_assert_held(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; bch2_accounting_free_counters(acc, true); acc->gc_running = false; } void bch2_fs_accounting_exit(struct bch_fs *c) { struct bch_accounting_mem *acc = &c->accounting; bch2_accounting_free_counters(acc, false); darray_exit(&acc->k); } |
| 7 517 238 9 137 149 155 17 172 15 48 13 1 1 1 281 281 435 15 3 88 82 119 2 1 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" #include "xfs_inode_util.h" /* * Kernel only inode definitions */ struct xfs_dinode; struct xfs_inode; struct xfs_buf; struct xfs_bmbt_irec; struct xfs_inode_log_item; struct xfs_mount; struct xfs_trans; struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ struct xfs_ifork i_af; /* attribute fork */ /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ struct rw_semaphore i_lock; /* inode lock */ atomic_t i_pincount; /* inode pin count */ struct llist_node i_gclist; /* deferred inactivation list */ /* * Bitsets of inode metadata that have been checked and/or are sick. * Callers must hold i_flags_lock before accessing this field. */ uint16_t i_checked; uint16_t i_sick; spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ uint64_t i_delayed_blks; /* count of delay alloc blks */ xfs_fsize_t i_disk_size; /* number of bytes in file */ xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ union { xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; uint8_t i_forkoff; /* attr fork offset >> 3 */ enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */ uint16_t i_diflags; /* XFS_DIFLAG_... */ uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ /* * Unlinked list pointers. These point to the next and previous inodes * in the AGI unlinked bucket list, respectively. These fields can * only be updated with the AGI locked. * * i_next_unlinked caches di_next_unlinked. */ xfs_agino_t i_next_unlinked; /* * If the inode is not on an unlinked list, this field is zero. If the * inode is the first element in an unlinked list, this field is * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode * in the unlinked list. */ xfs_agino_t i_prev_unlinked; /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ /* pending io completions */ spinlock_t i_ioend_lock; struct work_struct i_ioend_work; struct list_head i_ioend_list; } xfs_inode_t; static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) { return ip->i_prev_unlinked != 0; } static inline bool xfs_inode_has_attr_fork(const struct xfs_inode *ip) { return ip->i_forkoff > 0; } static inline struct xfs_ifork * xfs_ifork_ptr( struct xfs_inode *ip, int whichfork) { switch (whichfork) { case XFS_DATA_FORK: return &ip->i_df; case XFS_ATTR_FORK: if (!xfs_inode_has_attr_fork(ip)) return NULL; return &ip->i_af; case XFS_COW_FORK: return ip->i_cowfp; default: ASSERT(0); return NULL; } } static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip) { return ip->i_forkoff << 3; } static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip) { if (xfs_inode_has_attr_fork(ip)) return xfs_inode_fork_boff(ip); return XFS_LITINO(ip->i_mount); } static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip) { if (xfs_inode_has_attr_fork(ip)) return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip); return 0; } static inline unsigned int xfs_inode_fork_size( struct xfs_inode *ip, int whichfork) { switch (whichfork) { case XFS_DATA_FORK: return xfs_inode_data_fork_size(ip); case XFS_ATTR_FORK: return xfs_inode_attr_fork_size(ip); default: return 0; } } /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { return container_of(inode, struct xfs_inode, i_vnode); } /* convert from xfs inode to vfs inode */ static inline struct inode *VFS_I(struct xfs_inode *ip) { return &ip->i_vnode; } /* convert from const xfs inode to const vfs inode */ static inline const struct inode *VFS_IC(const struct xfs_inode *ip) { return &ip->i_vnode; } /* * For regular files we only update the on-disk filesize when actually * writing data back to disk. Until then only the copy in the VFS inode * is uptodate. */ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) { if (S_ISREG(VFS_I(ip)->i_mode)) return i_size_read(VFS_I(ip)); return ip->i_disk_size; } /* * If this I/O goes past the on-disk inode size update it unless it would * be past the current in-core inode size. */ static inline xfs_fsize_t xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) { xfs_fsize_t i_size = i_size_read(VFS_I(ip)); if (new_size > i_size || new_size < 0) new_size = i_size; return new_size > ip->i_disk_size ? new_size : 0; } /* * i_flags helper functions */ static inline void __xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { ip->i_flags |= flags; } static inline void xfs_iflags_set(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); __xfs_iflags_set(ip, flags); spin_unlock(&ip->i_flags_lock); } static inline void xfs_iflags_clear(xfs_inode_t *ip, unsigned long flags) { spin_lock(&ip->i_flags_lock); ip->i_flags &= ~flags; spin_unlock(&ip->i_flags_lock); } static inline int __xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { return (ip->i_flags & flags); } static inline int xfs_iflags_test(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = __xfs_iflags_test(ip, flags); spin_unlock(&ip->i_flags_lock); return ret; } static inline int xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = ip->i_flags & flags; if (ret) ip->i_flags &= ~flags; spin_unlock(&ip->i_flags_lock); return ret; } static inline int xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags) { int ret; spin_lock(&ip->i_flags_lock); ret = ip->i_flags & flags; if (!ret) ip->i_flags |= flags; spin_unlock(&ip->i_flags_lock); return ret; } static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_REFLINK; } static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_METADATA; } static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; /* Any file in the metadata directory tree is a metadata inode. */ if (xfs_has_metadir(mp)) return xfs_is_metadir_inode(ip); /* * Before metadata directories, the only metadata inodes were the * three quota files, the realtime bitmap, and the realtime summary. */ return ip->i_ino == mp->m_sb.sb_rbmino || ip->i_ino == mp->m_sb.sb_rsumino || xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) { return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) { return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; } /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. */ static inline bool xfs_inode_has_cow_data(const struct xfs_inode *ip) { return ip->i_cowfp && ip->i_cowfp->if_bytes; } static inline bool xfs_inode_has_bigtime(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME; } static inline bool xfs_inode_has_large_extent_counts(const struct xfs_inode *ip) { return ip->i_diflags2 & XFS_DIFLAG2_NREXT64; } /* * Decide if this file is a realtime file whose data allocation unit is larger * than a single filesystem block. */ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) { return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } /* * Return the buftarg used for data allocations on a given inode. */ #define xfs_inode_buftarg(ip) \ (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) static inline bool xfs_inode_can_atomicwrite( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) return false; if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) return false; return true; } /* * In-core inode flags. */ #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define XFS_INEW (1 << 3) /* inode has just been allocated */ #define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_EOFBLOCKS_RELEASED (1 << 6) /* eofblocks were freed in ->release */ #define XFS_IFLUSHING (1 << 7) /* inode is being flushed */ #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ #define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during * log recovery to replay a bmap operation on the inode. */ #define XFS_IRECOVERY (1 << 11) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* * If we need to update on-disk metadata before this IRECLAIMABLE inode can be * freed, then NEED_INACTIVE will be set. Once we start the updates, the * INACTIVATING bit will be set to keep iget away from this inode. After the * inactivation completes, both flags will be cleared and the inode is a * plain old IRECLAIMABLE inode. */ #define XFS_INACTIVATING (1 << 13) /* Quotacheck is running but inode has not been added to quota counts. */ #define XFS_IQUOTAUNCHECKED (1 << 14) /* * Remap in progress. Callers that wish to update file data while * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake * the lock in exclusive mode. Relocking the file will block until * IREMAPPING is cleared. */ #define XFS_IREMAPPING (1U << 15) /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ XFS_NEED_INACTIVE | \ XFS_INACTIVATING) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from * ocurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_EOFBLOCKS_RELEASED | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ #define XFS_IOLOCK_EXCL (1u << 0) #define XFS_IOLOCK_SHARED (1u << 1) #define XFS_ILOCK_EXCL (1u << 2) #define XFS_ILOCK_SHARED (1u << 3) #define XFS_MMAPLOCK_EXCL (1u << 4) #define XFS_MMAPLOCK_SHARED (1u << 5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* * Flags for lockdep annotations. * * XFS_LOCK_PARENT - for directory operations that require locking a * parent directory inode and a child entry inode. IOLOCK requires nesting, * MMAPLOCK does not support this class, ILOCK requires a single subclass * to differentiate parent from child. * * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary * inodes do not participate in the normal lock order, and thus have their * own subclasses. * * XFS_LOCK_INUMORDER - for locking several inodes at the some time * with xfs_lock_inodes(). This flag is used as the starting subclass * and each subsequent lock acquired will increment the subclass by one. * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly * limited to the subclasses we can represent via nesting. We need at least * 5 inodes nest depth for the ILOCK through rename, and we also have to support * XFS_ILOCK_PARENT, which gives 6 subclasses. That's 6 of the 8 subclasses * supported by lockdep. * * This also means we have to number the sub-classes in the lowest bits of * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep * mask and we can't use bit-masking to build the subclasses. What a mess. * * Bit layout: * * Bit Lock Region * 16-19 XFS_IOLOCK_SHIFT dependencies * 20-23 XFS_MMAPLOCK_SHIFT dependencies * 24-31 XFS_ILOCK_SHIFT dependencies * * IOLOCK values * * 0-3 subclass value * 4-7 unused * * MMAPLOCK values * * 0-3 subclass value * 4-7 unused * * ILOCK values * 0-4 subclass values * 5 PARENT subclass (not nestable) * 6 unused * 7 unused * */ #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000u #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 #define XFS_MMAPLOCK_MAX_SUBCLASS 3 #define XFS_MMAPLOCK_DEP_MASK 0x00f00000u #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT_VAL 5u #define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1) #define XFS_ILOCK_DEP_MASK 0xff000000u #define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT) #define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \ XFS_MMAPLOCK_DEP_MASK | \ XFS_ILOCK_DEP_MASK) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ >> XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ >> XFS_MMAPLOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ >> XFS_ILOCK_SHIFT) /* * Layouts are broken in the BREAK_WRITE case to ensure that * layout-holders do not collide with local writes. Additionally, * layouts are broken in the BREAK_UNMAP case to make sure the * layout-holder has a consistent view of the file's extent map. While * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases, * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to * go idle. */ enum layout_break_reason { BREAK_WRITE, BREAK_UNMAP, }; /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. */ #define XFS_INHERIT_GID(pip) \ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(const struct xfs_icreate_args *iargs, struct xfs_name *name, struct xfs_inode **ipp); int xfs_create_tmpfile(const struct xfs_icreate_args *iargs, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); int xfs_rename(struct mnt_idmap *idmap, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip, unsigned int flags); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); void xfs_assert_ilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, struct xfs_inode *); int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush_cluster(struct xfs_buf *); void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, struct xfs_inode *ip1, uint ip1_mode); int xfs_icreate(struct xfs_trans *tp, xfs_ino_t ino, const struct xfs_icreate_args *args, struct xfs_inode **ipp); static inline int xfs_itruncate_extents( struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, xfs_fsize_t new_size) { return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); static inline void xfs_update_stable_writes(struct xfs_inode *ip) { if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) mapping_set_stable_writes(VFS_I(ip)->i_mapping); else mapping_clear_stable_writes(VFS_I(ip)->i_mapping); } /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at * the VFS level to prevent the rest of the world seeing the inode * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. */ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } void xfs_irele(struct xfs_inode *ip); extern struct kmem_cache *xfs_inode_cache; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 bool xfs_inode_needs_inactive(struct xfs_inode *ip); struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino); int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agino_t prev_agino, xfs_agino_t next_agino); void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode); void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes); static inline bool xfs_inode_unlinked_incomplete( const struct xfs_inode *ip) { return VFS_IC(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); } int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip); int xfs_icreate_dqalloc(const struct xfs_icreate_args *args, struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp, struct xfs_dquot **pdqpp); #endif /* __XFS_INODE_H__ */ |
| 419 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H #include <linux/build_bug.h> #include <linux/compiler.h> #ifndef __ASSEMBLY__ #include <linux/cache.h> #include <asm/percpu.h> struct task_struct; struct pcpu_hot { union { struct { struct task_struct *current_task; int preempt_count; int cpu_number; #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING u64 call_depth; #endif unsigned long top_of_stack; void *hardirq_stack_ptr; u16 softirq_pending; #ifdef CONFIG_X86_64 bool hardirq_stack_inuse; #else void *softirq_stack_ptr; #endif }; u8 pad[64]; }; }; static_assert(sizeof(struct pcpu_hot) == 64); DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); /* const-qualified alias to pcpu_hot, aliased by linker. */ DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, const_pcpu_hot); static __always_inline struct task_struct *get_current(void) { if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) return this_cpu_read_const(const_pcpu_hot.current_task); return this_cpu_read_stable(pcpu_hot.current_task); } #define current get_current() #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_CURRENT_H */ |
| 7 260 260 260 262 262 262 260 260 256 18 262 262 262 7 7 7 7 7 7 7 7 7 261 7 7 7 262 261 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 | // SPDX-License-Identifier: GPL-2.0 /* * FPU signal frame handling routines. */ #include <linux/compat.h> #include <linux/cpu.h> #include <linux/pagemap.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> #include <asm/trapnr.h> #include <asm/trace/fpu.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; return true; } /* * Signal frame handlers. */ static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) return false; } else { struct fregs_state __user *fp = buf; u32 swd; if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) return false; } return true; } /* * Prepare the SW reserved portion of the fxsave memory layout, indicating * the presence of the extended state information in the memory layout * pointed to by the fpstate pointer in the sigcontext. * This is saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, struct fpstate *fpstate) { sw_bytes->magic1 = FP_XSTATE_MAGIC1; sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; sw_bytes->xfeatures = fpstate->user_xfeatures; sw_bytes->xstate_size = fpstate->user_size; if (ia32_frame) sw_bytes->extended_size += sizeof(struct fregs_state); } static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, struct fpstate *fpstate) { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ save_sw_bytes(&sw_bytes, ia32_frame, fpstate); err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) return !err; err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to * the FP/SSE bits by the legacy applications which don't touch * xfeatures in the xsave header. * * xsave aware apps can change the xfeatures in the xsave * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ xfeatures |= XFEATURE_MASK_FPSSE; err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { if (use_xsave()) return xsave_to_user_sigframe(buf, pkru); if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* * Save the fpu, extended register state to the user signal frame. * * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save * state is copied. * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. * * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * * Save it directly to the user frame with disabled page fault handler. If * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. * * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) return false; if (use_xsave()) { struct xregs_state __user *xbuf = buf_fx; /* * Clear the xsave header first, so that reserved fields are * initialized to zero. */ if (__clear_user(&xbuf->header, sizeof(xbuf->header))) return false; } retry: /* * Load the FPU registers if they are not valid for the current task. * With a valid FPU state we can attempt to save the state directly to * userland's stack frame which will likely succeed. If it does not, * resolve the fault in the user memory and try again. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx, pkru); pagefault_enable(); fpregs_unlock(); if (ret) { if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; return false; } /* Save the fsave header for the 32-bit frames. */ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) return false; if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) return false; return true; } static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, u64 xrestore, bool fx_only) { if (use_xsave()) { u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) ret = xrstor_from_user_sigframe(buf, xrestore); else ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); } else { return frstor_from_user_sigframe(buf); } } /* * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { struct fpu *fpu = ¤t->thread.fpu; int ret; /* Restore enabled features only. */ xrestore &= fpu->fpstate->user_xfeatures; retry: fpregs_lock(); /* Ensure that XFD is up to date */ xfd_update_state(fpu->fpstate); pagefault_disable(); ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { /* * The above did an FPU restore operation, restricted to * the user portion of the registers, and failed, but the * microcode might have modified the FPU registers * nevertheless. * * If the FPU registers do not belong to current, then * invalidate the FPU register state otherwise the task * might preempt current and return to user space with * corrupted FPU registers. */ if (test_thread_flag(TIF_NEED_FPU_LOAD)) __cpu_invalidate_fpregs_state(); fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ if (ret != X86_TRAP_PF) return false; if (!fault_in_readable(buf, fpu->fpstate->user_size)) goto retry; return false; } /* * Restore supervisor states: previous context switch etc has done * XSAVES and saved the supervisor states in the kernel buffer from * which they can be restored now. * * It would be optimal to handle this with a single XRSTORS, but * this does not work because the rest of the FPU registers have * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); return true; } static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; u64 user_xfeatures = 0; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) return false; fx_only = !fx_sw_user.magic1; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; } if (likely(!ia32_fxstate)) { /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); } /* * Copy the legacy state because the FP portion of the FX frame has * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ if (__copy_from_user(&env, buf, sizeof(env))) return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered * to be loaded again on return to userland (overriding last_cpu avoids * the optimisation). */ fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { /* * If supervisor states are available then save the * hardware state in current's fpstate so that the * supervisor state is preserved. Save the full state for * simplicity. There is no point in optimizing this by only * saving the supervisor states and then shuffle them to * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { if (copy_sigframe_from_user_to_xstate(tsk, buf_fx)) return false; } else { if (__copy_from_user(&fpregs->fxsave, buf_fx, sizeof(fpregs->fxsave))) return false; if (IS_ENABLED(CONFIG_X86_64)) { /* Reject invalid MXCSR values. */ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) return false; } else { /* Mask invalid bits out for historical reasons (broken hardware). */ fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { /* * Remove all UABI feature bits not set in user_xfeatures * from the memory xstate header which makes the full * restore below bring them into init state. This works for * fx_only mode as well because that has only FP and SSE * set in user_xfeatures. * * Preserve supervisor states! */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); fpregs->xsave.header.xfeatures &= mask; success = !os_xrstor_safe(fpu->fpstate, fpu_kernel_cfg.max_features); } else { success = !fxrstor_safe(&fpregs->fxsave); } if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); return success; } static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { unsigned int size = fpstate->user_size; return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); return true; } size = xstate_sigframe_size(fpu->fpstate); ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); /* * Only FXSR enabled systems need the FX state quirk. * FRSTOR does not need it and can use the fast path. */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); ia32_fxstate = true; } if (!access_ok(buf, size)) goto out; if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { success = !fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), NULL, buf); } else { success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: if (unlikely(!success)) fpu__clear_user_states(fpu); return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { frame_size += sizeof(struct fregs_state); sp -= sizeof(struct fregs_state); } *size = frame_size; return sp; } unsigned long __init fpu__get_fpstate_size(void) { unsigned long ret = fpu_user_cfg.max_size; if (use_xsave()) ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit * app is running on a 64-bit kernel. To keep things simple, just * assume the worst case and always include space for 'freg_state', * even for 64-bit apps on 64-bit kernels. This wastes a bit of * space, but keeps the code simple. */ if ((IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) ret += sizeof(struct fregs_state); return ret; } |
| 437 300 487 440 440 437 440 440 440 347 29 347 58 59 47 47 47 13 47 60 365 368 346 301 46 29 347 367 368 301 301 301 300 2 3 3 376 376 45 3 41 51 51 51 51 51 48 51 51 51 51 1291 1253 51 153 2 24 1031 1031 350 1185 1033 1037 1033 1033 1031 1030 1031 1037 1033 173 174 173 24 153 174 174 6 8 350 353 352 354 1 1 1 1 1 1 1 1 3 3 301 300 301 53 54 3 52 54 52 52 52 51 3 52 51 3 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 | /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> * Released under the General Public License (GPL). * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. * * Provides methods for unmapping each kind of mapped page: * the anon methods track anonymous pages, and * the file methods track pages belonging to an inode. * * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 * Contributions by Hugh Dickins 2003, 2004 */ /* * Lock ordering in mm: * * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * * hugetlbfs PageHuge() take locks in this order: * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) * folio_lock */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <linux/oom.h> #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include <trace/events/migrate.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { struct anon_vma *anon_vma; anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); anon_vma->num_children = 0; anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called * from fork, the root will be reset to the parents anon_vma. */ anon_vma->root = anon_vma; } return anon_vma; } static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); } static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, which * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * * As a result, we need to do proper anon_vma locking even * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. */ int __anon_vma_prepare(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); return 0; out_enomem_free_avc: anon_vma_chain_free(avc); out_enomem: return -ENOMEM; } /* * This is a useful helper function for locking the anon_vma root as * we traverse the vma->anon_vma_chain, looping over anon_vma's that * have the same vma. * * Such anon_vma's should have the same root, so you'd expect to see * just a single mutex_lock for the whole traversal. */ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) { struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) up_write(&root->rwsem); root = new_root; down_write(&root->rwsem); } return root; } static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) up_write(&root->rwsem); } /* * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before * call, we can identify this case by checking (!dst->anon_vma && * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. * This prevents degradation of anon_vma hierarchy to endless linear chain in * case of constantly forking task. On the other hand, an anon_vma with more * than one child isn't reused even if there was no alive vma, thus rmap * walker has a good chance of avoiding scanning the whole hierarchy when it * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; } anon_vma = pavc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* * Reuse existing anon_vma if it has no vma and only one * anon_vma child. * * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; enomem_failure: /* * dst->anon_vma is dropped here otherwise its num_active_vmas can * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) return 0; /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ error = anon_vma_clone(vma, pvma); if (error) return error; /* An existing anon_vma has been reused, all done then. */ if (vma->anon_vma) return 0; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; anon_vma->parent = pvma->anon_vma; /* * With refcounts, an anon_vma can stay around longer than the * process it belongs to. The root anon_vma needs to be pinned until * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; out_error_free_anon_vma: put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; } void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->num_children--; continue; } list_del(&avc->same_vma); anon_vma_chain_free(avc); } if (vma->anon_vma) { vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; } unlock_anon_vma_root(root); /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; VM_WARN_ON(anon_vma->num_children); VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); anon_vma_chain_free(avc); } } static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); } /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against folio_remove_rmap_*() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). * * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as * long as we observe page_mapped() [ hence all those page_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. * * NOTE: the caller should normally hold folio lock when calling this. If * not, the caller needs to double check the anon_vma didn't change after * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it * concurrently without folio lock protection). See folio_lock_anon_vma_read() * which has already covered that, and comment above remap_pages(). */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } /* * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } out: rcu_read_unlock(); return anon_vma; } /* * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * folio_move_anon_rmap() might have changed the anon_vma as we * might not hold the folio lock here. */ if (unlikely((unsigned long)READ_ONCE(folio->mapping) != anon_mapping)) { up_read(&root_anon_vma->rwsem); rcu_read_unlock(); goto retry; } /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; } if (rwc && rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); anon_vma_lock_read(anon_vma); /* * folio_move_anon_rmap() might have changed the anon_vma as we might * not hold the folio lock here. */ if (unlikely((unsigned long)READ_ONCE(folio->mapping) != anon_mapping)) { anon_vma_unlock_read(anon_vma); put_anon_vma(anon_vma); anon_vma = NULL; goto retry; } if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because * we'll deadlock on the anon_vma_lock_write() recursion. */ anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } return anon_vma; out: rcu_read_unlock(); return anon_vma; } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed * before any IO is initiated on the page to prevent lost writes. Similarly, * it must be flushed before freeing to prevent data leakage. */ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (!tlb_ubc->flush_required) return; arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; } /* Flush iff there are potentially writable TLB entries that can race with IO */ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (tlb_ubc->writable) try_to_unmap_flush(); } /* * Bits 0-14 of mm->tlb_flush_batched record pending generations. * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. */ #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 #define TLB_FLUSH_BATCH_PENDING_MASK \ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; bool writable = pte_dirty(pteval); if (!pte_accessible(mm, pteval)) return; arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required = true; /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. */ barrier(); batch = atomic_read(&mm->tlb_flush_batched); retry: if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { /* * Prevent `pending' from catching up with `flushed' because of * overflow. Reset `pending' and `flushed' to be 1 and 0 if * `pending' becomes large. */ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) goto retry; } else { atomic_inc(&mm->tlb_flush_batched); } /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. */ if (writable) tlb_ubc->writable = true; } /* * Returns true if the TLB flush should be deferred to the end of a batch of * unmap operations to reduce IPIs. */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { if (!(flags & TTU_BATCH_FLUSH)) return false; return arch_tlbbatch_should_defer(mm); } /* * Reclaim unmaps pages under the PTL but do not flush the TLB prior to * releasing the PTL if TLB flushes are batched. It's possible for a parallel * operation such as mprotect or munmap to race between reclaim unmapping * the page and flushing the page. If this race occurs, it potentially allows * access to data via a stale TLB entry. Tracking all mm's that have TLB * batching in flight would be expensive during reclaim so instead track * whether TLB batching occurred in the past and if so then do a flush here * if required. This will cost one additional flush per reclaim cycle paid * by the first operation at risk such as mprotect and mumap. * * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise * via the PTL. */ void flush_tlb_batched_pending(struct mm_struct *mm) { int batch = atomic_read(&mm->tlb_flush_batched); int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { arch_flush_tlb_batched_pending(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. */ atomic_cmpxchg(&mm->tlb_flush_batched, batch, pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr) { } static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { return false; } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /** * page_address_in_vma - The virtual address of a page in this VMA. * @folio: The folio containing the page. * @page: The page within the folio. * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. * It is the caller's responsibililty to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. * * Context: Caller should hold a reference to the folio. Caller should * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the * VMA from being altered. * * Return: The virtual address corresponding to this page in the VMA. */ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } /* KSM folios don't reach here because of the !page__anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } /* * Returns the actual pmd_t* where we expect 'address' to be mapped from, or * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* * represents. */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); out: return pmd; } struct folio_referenced_arg { int mapcount; int referenced; unsigned long vm_flags; struct mem_cgroup *memcg; }; /* * arg: folio_referenced_arg will be passed */ static bool folio_referenced_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; unsigned long start = address, ptes = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { if (!folio_test_large(folio) || !pvmw.pte) { /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } /* * For large folio fully mapped to VMA, will * be handled after the pvmw loop. * * For large folio cross VMA boundaries, it's * expected to be picked by page reclaim. But * should skip reference of pages which are in * the range of VM_LOCKED vma. As page reclaim * should just count the reference of pages out * the range of VM_LOCKED vma. */ ptes++; pra->mapcount--; continue; } /* * Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. This avoids redundant * swap-out followed by an immediate unmap. */ if ((!atomic_read(&vma->vm_mm->mm_users) || check_stable_address_space(vma->vm_mm)) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_likely_mapped_shared(folio)) { pra->referenced = -1; page_vma_mapped_walk_done(&pvmw); return false; } if (lru_gen_enabled() && pvmw.pte) { if (lru_gen_look_around(&pvmw)) referenced++; } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++; } else { /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); } pra->mapcount--; } if ((vma->vm_flags & VM_LOCKED) && folio_test_large(folio) && folio_within_vma(folio, vma)) { unsigned long s_align, e_align; s_align = ALIGN_DOWN(start, PMD_SIZE); e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); /* folio doesn't cross page table boundary and fully mapped */ if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } } if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) return false; /* To break the loop */ return true; } static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; /* * Ignore references from this mapping if it has no recency. If the * folio has been used in another mapping, we will catch it; if this * other mapping is already gone, the unmap path will have set the * referenced flag or activated the folio in zap_pte_range(). */ if (!vma_has_recency(vma)) return true; /* * If we are reclaiming on behalf of a cgroup, skip counting on behalf * of references from different cgroups. */ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; } /** * folio_referenced() - Test if the folio was referenced. * @folio: The folio to test. * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. * * Quick test_and_clear_referenced for all mappings of a folio, * * Return: The number of mappings which referenced the folio. Return -1 if * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; if (!pra.mapcount) return 0; if (!folio_raw_mapping(folio)) return 0; if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { we_locked = folio_trylock(folio); if (!we_locked) return 1; } rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) folio_unlock(folio); return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { int cleaned = 0; struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { int ret = 0; address = pvmw->address; if (pvmw->pte) { pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); if (!pte_dirty(entry) && !pte_write(entry)) continue; flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; pmd_t entry; if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); #endif } if (ret) cleaned++; } mmu_notifier_invalidate_range_end(&range); return cleaned; } static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); int *cleaned = arg; *cleaned += page_vma_mkclean_one(&pvmw); return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) return false; return true; } int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; struct rmap_walk_control rwc = { .arg = (void *)&cleaned, .rmap_one = page_mkclean_one, .invalid_vma = invalid_mkclean_vma, }; BUG_ON(!folio_test_locked(folio)); if (!folio_mapped(folio)) return 0; mapping = folio_mapping(folio); if (!mapping) return 0; rmap_walk(folio, &rwc); return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) * within the @vma of shared mappings. And since clean PTEs * should also be readonly, write protects them too. * @pfn: start pfn. * @nr_pages: number of physically contiguous pages srarting with @pfn. * @pgoff: page offset that the @pfn mapped with. * @vma: vma that @pfn mapped within. * * Returns the number of cleaned PTEs (including PMDs). */ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { .pfn = pfn, .nr_pages = nr_pages, .pgoff = pgoff, .vma = vma, .flags = PVMW_SYNC, }; if (invalid_mkclean_vma(vma, NULL)) return 0; pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } static __always_inline unsigned int __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, enum rmap_level level, int *nr_pmdmapped) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; int first = 0, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; } do { first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); if (first && atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) nr = first; atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { *nr_pmdmapped = folio_nr_pages(folio); nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of ENTIRELY_MAPPED */ nr = 0; } } atomic_inc(&folio->_large_mapcount); break; } return nr; } /** * folio_move_anon_rmap - move a folio to our anon_vma * @folio: The folio to move to our anon_vma * @vma: The vma the folio belongs to * * When a folio belongs exclusively to one process after a COW event, * that folio can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling processes. */ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); } /** * __folio_set_anon - set up a new anonymous rmap for a folio * @folio: The folio to set up the new anonymous rmap for. * @vma: VM area to add the folio to. * @address: User virtual address of the mapping * @exclusive: Whether the folio is exclusive to the process. */ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } /** * __page_check_anon_rmap - sanity check anonymous rmap addition * @folio: The folio containing @page. * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ static void __page_check_anon_rmap(const struct folio *folio, const struct page *page, struct vm_area_struct *vma, unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. * * We have exclusion against folio_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked * over the call to folio_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) { int idx; if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; __lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; __mod_node_page_state(folio_pgdat(folio), idx, nr_pmdmapped); } } } static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) { int i, nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); __folio_mod_stat(folio, nr, nr_pmdmapped); if (flags & RMAP_EXCLUSIVE) { switch (level) { case RMAP_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; case RMAP_LEVEL_PMD: SetPageAnonExclusive(page); break; } } for (i = 0; i < nr_pages; i++) { struct page *cur_page = page + i; /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 || (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && PageAnonExclusive(cur_page), folio); } /* * For large folio, only mlock it if it's fully mapped to VMA. It's * not easy to check whether the large folio is fully mapped to VMA * here. Only mlock normal 4K folio and leave page reclaim to handle * large folio. */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); } /** * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages which will be mapped * @vma: The vm area in which the mappings are added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + nr_pages) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, * and to ensure that an anon folio is not being upgraded racily to a KSM folio * (but KSM folios are never downgraded). */ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, RMAP_LEVEL_PTE); } /** * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting. */ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio doesn't necessarily need to be locked while it's exclusive * unless two threads map it concurrently. However, the folio must be * locked if it's shared. * * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { const int nr = folio_nr_pages(folio); const bool exclusive = flags & RMAP_EXCLUSIVE; int nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); VM_BUG_ON_VMA(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end, vma); /* * VM_DROPPABLE mappings don't swap; instead they're just dropped when * under memory pressure. */ if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); if (exclusive) SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; for (i = 0; i < nr; i++) { struct page *page = folio_page(folio, i); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); if (exclusive) SetPageAnonExclusive(page); } /* increment count (starts at -1) */ atomic_set(&folio->_large_mapcount, nr - 1); atomic_set(&folio->_nr_pages_mapped, nr); } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); /* increment count (starts at -1) */ atomic_set(&folio->_large_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); if (exclusive) SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } __folio_mod_stat(folio, nr, nr_pmdmapped); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { int nr, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); __folio_mod_stat(folio, nr, nr_pmdmapped); /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); } /** * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages that will be mapped using PTEs * @vma: The vm area in which the mappings are added * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); } /** * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; } atomic_sub(nr_pages, &folio->_large_mapcount); do { last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); if (last && atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) nr = last; partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: atomic_dec(&folio->_large_mapcount); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; } else { /* An add of ENTIRELY_MAPPED raced ahead */ nr = 0; } } partially_mapped = nr && nr < nr_pmdmapped; break; } /* * Queue anon large folio for deferred split if at least one page of * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. */ if (partially_mapped && folio_test_anon(folio) && !folio_test_partially_mapped(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully * unmapped, but that might overwrite a racing folio_add_anon_rmap_*() * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. */ munlock_vma_folio(folio, vma); } /** * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio * @folio: The folio to remove the mappings from * @page: The first page to remove * @nr_pages: The number of pages that will be removed from the mapping * @vma: The vm area from which the mappings are removed * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); } /** * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /* * @arg: enum ttu_flags will be passed to this argument */ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the folio can not be freed in this function as call of * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); goto walk_abort; } if (!pvmw.pte) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; if (flags & TTU_SPLIT_HUGE_PMD) { /* * We temporarily have to drop the PTL and * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, false, folio); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; } } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); pfn = pte_pfn(ptep_get(pvmw.pte)); subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. */ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and fail * if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ goto walk_done; } hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } } /* * Now the pte is cleared. If this pte was uffd-wp armed, * we may want to replace a none pte with a marker pte if * it's file-backed, so we don't lose the tracking info. */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ if (unlikely(folio_test_swapbacked(folio) != folio_test_swapcache(folio))) { WARN_ON_ONCE(1); goto walk_abort; } /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { int ref_count, map_count; /* * Synchronize with gup_pte_range(): * - clear PTE; barrier; read refcount * - inc refcount; barrier; read PTE */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for page refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); /* * The only page refs must be one from isolation * plus the rmap(s) (dropped by discard:). */ if (ref_count == 1 + map_count && (!folio_test_dirty(folio) || /* * Unlike MADV_FREE mappings, VM_DROPPABLE * ones can be dropped even if they've * been dirtied. */ (vma->vm_flags & VM_DROPPABLE))) { dec_mm_counter(mm, MM_ANONPAGES); goto discard; } /* * If the folio was redirtied, it cannot be * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); /* * Unlike MADV_FREE mappings, VM_DROPPABLE ones * never get swap backed on failure to drop. */ if (!(vma->vm_flags & VM_DROPPABLE)) folio_set_swapbacked(folio); goto walk_abort; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, * so it cannot be removed from the page * cache and replaced by a new folio before * mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table * to point at a new folio while a device is * still using this folio. * * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) hugetlb_remove_rmap(folio); else folio_remove_rmap_pte(folio, subpage, vma); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); continue; walk_abort: ret = false; walk_done: page_vma_mapped_walk_done(&pvmw); break; } mmu_notifier_invalidate_range_end(&range); return ret; } static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { return vma_is_temporary_stack(vma); } static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } /** * try_to_unmap - Try to remove all page table mappings to a folio. * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). * * Context: Caller must hold the folio lock. */ void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } /* * @arg: enum ttu_flags will be passed to this argument. * * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_migrate() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * unmap_page() in mm/huge_memory.c is the only user of migration with * TTU_SPLIT_HUGE_PMD and it wants to freeze. */ if (flags & TTU_SPLIT_HUGE_PMD) split_huge_pmd_address(vma, address, true, folio); /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); if (set_pmd_migration_entry(&pvmw, subpage)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } continue; } #endif /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); pfn = pte_pfn(ptep_get(pvmw.pte)); if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and * calculating the subpage as for the common case would * result in an invalid pointer. * * Since only PAGE_SIZE pages can currently be * migrated, just set it to page. This will need to be * changed when hugepage migrations to device private * memory are supported. */ VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and * fail if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } } /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (folio_is_device_private(folio)) { unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; if (anon_exclusive) WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio, subpage)); /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ entry = pte_to_swp_entry(pteval); if (is_writable_device_private_entry(entry)) entry = make_writable_migration_entry(pfn); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry(pfn); else entry = make_readable_migration_entry(pfn); swp_pte = swp_entry_to_pte(entry); /* * pteval maps a zone device page and is therefore * a swap pte. */ if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && !anon_exclusive, subpage); /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (folio_test_hugetlb(folio)) { if (anon_exclusive && hugetlb_try_share_anon_rmap(folio)) { set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); ret = false; page_vma_mapped_walk_done(&pvmw); break; } } else if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (pte_write(pteval)) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry( page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); if (pte_young(pteval)) entry = make_migration_entry_young(entry); if (pte_dirty(pteval)) entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } if (unlikely(folio_test_hugetlb(folio))) hugetlb_remove_rmap(folio); else folio_remove_rmap_pte(folio, subpage, vma); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * try_to_migrate - try to replace all page table mappings with swap entries * @folio: the folio to replace page table entries for * @flags: action and flags * * Tries to remove all the page table entries which are mapping this folio and * replace them with special swap entries. Caller must hold the folio lock. */ void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the * page tables leading to a race where migration cannot * find the migration ptes. Rather than increasing the * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE struct make_exclusive_args { struct mm_struct *mm; unsigned long address; void *owner; bool valid; }; static bool page_make_device_exclusive_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *priv) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); struct make_exclusive_args *args = priv; pte_t pteval; struct page *subpage; bool ret = true; struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); ptent = ptep_get(pvmw.pte); if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* * Check that our target page is still mapped at the expected * address. */ if (args->mm == mm && args->address == address && pte_write(pteval)) args->valid = true; /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (pte_write(pteval)) entry = make_writable_device_exclusive_entry( page_to_pfn(subpage)); else entry = make_readable_device_exclusive_entry( page_to_pfn(subpage)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ folio_remove_rmap_pte(folio, subpage, vma); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * folio_make_device_exclusive - Mark the folio exclusively owned by a device. * @folio: The folio to replace page table entries for. * @mm: The mm_struct where the folio is expected to be mapped. * @address: Address where the folio is expected to be mapped. * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks * * Tries to remove all the page table entries which are mapping this * folio and replace them with special device exclusive swap entries to * grant a device exclusive access to the folio. * * Context: Caller must hold the folio lock. * Return: false if the page is still mapped, or if it could not be unmapped * from the expected address. Otherwise returns true (success). */ static bool folio_make_device_exclusive(struct folio *folio, struct mm_struct *mm, unsigned long address, void *owner) { struct make_exclusive_args args = { .mm = mm, .address = address, .owner = owner, .valid = false, }; struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; /* * Restrict to anonymous folios for now to avoid potential writeback * issues. */ if (!folio_test_anon(folio)) return false; rmap_walk(folio, &rwc); return args.valid && !folio_mapcount(folio); } /** * make_device_exclusive_range() - Mark a range for exclusive use by a device * @mm: mm_struct of associated target process * @start: start of the region to mark for exclusive device access * @end: end address of region * @pages: returns the pages which were successfully marked for exclusive access * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering * * Returns: number of pages found in the range by GUP. A page is marked for * exclusive access only if the page pointer is non-NULL. * * This function finds ptes mapping page(s) to the given address range, locks * them and replaces mappings with special swap entries preventing userspace CPU * access. On fault these entries are replaced with the original mapping after * calling MMU notifiers. * * A driver using this to program access from a device must use a mmu notifier * critical section to hold a device specific lock during programming. Once * programming is complete it should drop the page lock and reference after * which point CPU access to the page will revoke the exclusive access. */ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct page **pages, void *owner) { long npages = (end - start) >> PAGE_SHIFT; long i; npages = get_user_pages_remote(mm, start, npages, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, pages, NULL); if (npages < 0) return npages; for (i = 0; i < npages; i++, start += PAGE_SIZE) { struct folio *folio = page_folio(pages[i]); if (PageTail(pages[i]) || !folio_trylock(folio)) { folio_put(folio); pages[i] = NULL; continue; } if (!folio_make_device_exclusive(folio, mm, start, owner)) { folio_unlock(folio); folio_put(folio); pages[i] = NULL; } } return npages; } EXPORT_SYMBOL_GPL(make_device_exclusive_range); #endif void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); } static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; if (anon_vma_trylock_read(anon_vma)) goto out; if (rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } anon_vma_lock_read(anon_vma); out: return anon_vma; } /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(vma, pgoff_start, folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(folio)) break; } if (!locked) anon_vma_unlock_read(anon_vma); } /* * rmap_walk_file - do something to file page using the object-based rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; /* * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_rwsem. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!mapping) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; if (rwc->try_lock) { rwc->contended = true; return; } i_mmap_lock_read(mapping); } lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(vma, pgoff_start, folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(folio)) goto done; } done: if (!locked) i_mmap_unlock_read(mapping); } void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE /* * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. */ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && PageAnonExclusive(&folio->page), folio); } void hugetlb_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */ |
| 90 10 80 15 15 15 4 12 15 15 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * Check do_open_execat() for an explanation. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * by hand ahead of time. */ if (write && pos < vma->vm_start) { mmap_write_lock(mm); ret = expand_downwards(vma, pos); if (unlikely(ret < 0)) { mmap_write_unlock(mm); return NULL; } mmap_write_downgrade(mm); } else mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Need to be called with mmap write lock * held, to avoid race with ksmd. */ err = ksm_execve(mm); if (err) goto err_ksm; /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: ksm_exit(mm); err_ksm: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) return ERR_PTR(-EACCES); err = deny_write_access(file); if (err) return ERR_PTR(err); return no_free_ptr(file); } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(¤t->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (!file) return; allow_write_access(file); fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif |
| 3 3 3 3 3 56 2 121 122 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> * Copyright (c) 2001-2012 Anton Altaparmakov * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> * * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads */ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/stringify.h> #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/msdos_partition.h> #include "ldm.h" #include "check.h" /* * ldm_debug/info/error/crit - Output an error message * @f: A printf format string containing the message * @...: Variables to substitute into @f * * ldm_debug() writes a DEBUG level message to the syslog but only if the * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. */ #ifndef CONFIG_LDM_DEBUG #define ldm_debug(...) do {} while (0) #else #define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) #endif #define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) static __printf(3, 4) void _ldm_printk(const char *level, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start (args, fmt); vaf.fmt = fmt; vaf.va = &args; printk("%s%s(): %pV\n", level, function, &vaf); va_end(args); } /** * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure * @data: Raw database PRIVHEAD structure loaded from the device * @ph: In-memory privhead structure in which to return parsed information * * This parses the LDM database PRIVHEAD structure supplied in @data and * sets up the in-memory privhead structure @ph with the obtained information. * * Return: 'true' @ph contains the PRIVHEAD data * 'false' @ph contents are undefined */ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) { bool is_vista = false; BUG_ON(!data || !ph); if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { ldm_error("Cannot find PRIVHEAD structure. LDM database is" " corrupt. Aborting."); return false; } ph->ver_major = get_unaligned_be16(data + 0x000C); ph->ver_minor = get_unaligned_be16(data + 0x000E); ph->logical_disk_start = get_unaligned_be64(data + 0x011B); ph->logical_disk_size = get_unaligned_be64(data + 0x0123); ph->config_start = get_unaligned_be64(data + 0x012B); ph->config_size = get_unaligned_be64(data + 0x0133); /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ if (ph->ver_major == 2 && ph->ver_minor == 12) is_vista = true; if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." " Aborting.", ph->ver_major, ph->ver_minor); return false; } ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, ph->ver_minor, is_vista ? "Vista" : "2000/XP"); if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ /* Warn the user and continue, carefully. */ ldm_info("Database is normally %u bytes, it claims to " "be %llu bytes.", LDM_DB_SIZE, (unsigned long long)ph->config_size); } if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + ph->logical_disk_size > ph->config_start)) { ldm_error("PRIVHEAD disk size doesn't match real disk size"); return false; } if (uuid_parse(data + 0x0030, &ph->disk_id)) { ldm_error("PRIVHEAD contains an invalid GUID."); return false; } ldm_debug("Parsed PRIVHEAD successfully."); return true; } /** * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure * @data: Raw database TOCBLOCK structure loaded from the device * @toc: In-memory toc structure in which to return parsed information * * This parses the LDM Database TOCBLOCK (table of contents) structure supplied * in @data and sets up the in-memory tocblock structure @toc with the obtained * information. * * N.B. The *_start and *_size values returned in @toc are not range-checked. * * Return: 'true' @toc contains the TOCBLOCK data * 'false' @toc contents are undefined */ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) { BUG_ON (!data || !toc); if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } strscpy_pad(toc->bitmap1_name, data + 0x24, sizeof(toc->bitmap1_name)); toc->bitmap1_start = get_unaligned_be64(data + 0x2E); toc->bitmap1_size = get_unaligned_be64(data + 0x36); if (strncmp (toc->bitmap1_name, TOC_BITMAP1, sizeof (toc->bitmap1_name)) != 0) { ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", TOC_BITMAP1, toc->bitmap1_name); return false; } strscpy_pad(toc->bitmap2_name, data + 0x46, sizeof(toc->bitmap2_name)); toc->bitmap2_start = get_unaligned_be64(data + 0x50); toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, sizeof (toc->bitmap2_name)) != 0) { ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", TOC_BITMAP2, toc->bitmap2_name); return false; } ldm_debug ("Parsed TOCBLOCK successfully."); return true; } /** * ldm_parse_vmdb - Read the LDM Database VMDB structure * @data: Raw database VMDB structure loaded from the device * @vm: In-memory vmdb structure in which to return parsed information * * This parses the LDM Database VMDB structure supplied in @data and sets up * the in-memory vmdb structure @vm with the obtained information. * * N.B. The *_start, *_size and *_seq values will be range-checked later. * * Return: 'true' @vm contains VMDB info * 'false' @vm contents are undefined */ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) { BUG_ON (!data || !vm); if (MAGIC_VMDB != get_unaligned_be32(data)) { ldm_crit ("Cannot find the VMDB, database may be corrupt."); return false; } vm->ver_major = get_unaligned_be16(data + 0x12); vm->ver_minor = get_unaligned_be16(data + 0x14); if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { ldm_error ("Expected VMDB version %d.%d, got %d.%d. " "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); return false; } vm->vblk_size = get_unaligned_be32(data + 0x08); if (vm->vblk_size == 0) { ldm_error ("Illegal VBLK size"); return false; } vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); ldm_debug ("Parsed VMDB successfully."); return true; } /** * ldm_compare_privheads - Compare two privhead objects * @ph1: First privhead * @ph2: Second privhead * * This compares the two privhead structures @ph1 and @ph2. * * Return: 'true' Identical * 'false' Different */ static bool ldm_compare_privheads (const struct privhead *ph1, const struct privhead *ph2) { BUG_ON (!ph1 || !ph2); return ((ph1->ver_major == ph2->ver_major) && (ph1->ver_minor == ph2->ver_minor) && (ph1->logical_disk_start == ph2->logical_disk_start) && (ph1->logical_disk_size == ph2->logical_disk_size) && (ph1->config_start == ph2->config_start) && (ph1->config_size == ph2->config_size) && uuid_equal(&ph1->disk_id, &ph2->disk_id)); } /** * ldm_compare_tocblocks - Compare two tocblock objects * @toc1: First toc * @toc2: Second toc * * This compares the two tocblock structures @toc1 and @toc2. * * Return: 'true' Identical * 'false' Different */ static bool ldm_compare_tocblocks (const struct tocblock *toc1, const struct tocblock *toc2) { BUG_ON (!toc1 || !toc2); return ((toc1->bitmap1_start == toc2->bitmap1_start) && (toc1->bitmap1_size == toc2->bitmap1_size) && (toc1->bitmap2_start == toc2->bitmap2_start) && (toc1->bitmap2_size == toc2->bitmap2_size) && !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, sizeof (toc1->bitmap1_name)) && !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, sizeof (toc1->bitmap2_name))); } /** * ldm_validate_privheads - Compare the primary privhead with its backups * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. * * The privheads on disk show the size and location of the main disk area and * the configuration area (the database). The values are range-checked against * @hd, which contains the real size of the disk. * * Return: 'true' Success * 'false' Error */ static bool ldm_validate_privheads(struct parsed_partitions *state, struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; Sector sect; u8 *data; bool result = false; long num_sects; int i; BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); if (!ph[1] || !ph[2]) { ldm_crit ("Out of memory."); goto out; } /* off[1 & 2] are relative to ph[0]->config_start */ ph[0]->config_start = 0; /* Read and parse privheads */ for (i = 0; i < 3; i++) { data = read_part_sector(state, ph[0]->config_start + off[i], §); if (!data) { ldm_crit ("Disk read failed."); goto out; } result = ldm_parse_privhead (data, ph[i]); put_dev_sector (sect); if (!result) { ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ if (i < 2) goto out; /* Already logged */ else break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ } } num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { ldm_crit ("Database extends beyond the end of the disk."); goto out; } if ((ph[0]->logical_disk_start > ph[0]->config_start) || ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) > ph[0]->config_start)) { ldm_crit ("Disk and database overlap."); goto out; } if (!ldm_compare_privheads (ph[0], ph[1])) { ldm_crit ("Primary and backup PRIVHEADs don't match."); goto out; } /* FIXME ignore this for now if (!ldm_compare_privheads (ph[0], ph[2])) { ldm_crit ("Primary and backup PRIVHEADs don't match."); goto out; }*/ ldm_debug ("Validated PRIVHEADs successfully."); result = true; out: kfree (ph[1]); kfree (ph[2]); return result; } /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ static bool ldm_validate_tocblocks(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; struct privhead *ph; Sector sect; u8 *data; int i, nr_tbs; bool result = false; BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc_array(3, sizeof(*tb[1]), GFP_KERNEL); if (!tb[1]) { ldm_crit("Out of memory."); goto err; } tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); /* * Try to read and parse all four TOCBLOCKs. * * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so * skip any that fail as long as we get at least one valid TOCBLOCK. */ for (nr_tbs = i = 0; i < 4; i++) { data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; } if (ldm_parse_tocblock(data, tb[nr_tbs])) nr_tbs++; put_dev_sector(sect); } if (!nr_tbs) { ldm_crit("Failed to find a valid TOCBLOCK."); goto err; } /* Range check the TOCBLOCK against a privhead. */ if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > ph->config_size)) { ldm_crit("The bitmaps are out of range. Giving up."); goto err; } /* Compare all loaded TOCBLOCKs. */ for (i = 1; i < nr_tbs; i++) { if (!ldm_compare_tocblocks(tb[0], tb[i])) { ldm_crit("TOCBLOCKs 0 and %d do not match.", i); goto err; } } ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); result = true; err: kfree(tb[1]); return result; } /** * ldm_validate_vmdb - Read the VMDB and validate it * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * * Find the vmdb of the LDM Database stored on @bdev and return the parsed * information in @ldb. * * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ static bool ldm_validate_vmdb(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; bool result = false; struct vmdb *vm; struct tocblock *toc; BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; } if (!ldm_parse_vmdb (data, vm)) goto out; /* Already logged */ /* Are there uncommitted transactions? */ if (get_unaligned_be16(data + 0x10) != 0x01) { ldm_crit ("Database is not in a consistent state. Aborting."); goto out; } if (vm->vblk_offset != 512) ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); /* * The last_vblkd_seq can be before the end of the vmdb, just make sure * it is not out of bounds. */ if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " "Database is corrupt. Aborting."); goto out; } result = true; out: put_dev_sector (sect); return result; } /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * * Return: 'true' @state->disk is a dynamic disk * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; struct msdos_partition *p; int i; bool result = false; BUG_ON(!state); data = read_part_sector(state, 0, §); if (!data) { ldm_info ("Disk read failed."); return false; } if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) goto out; p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) if (p->sys_ind == LDM_PARTITION) { result = true; break; } if (result) ldm_debug ("Found W2K dynamic disk partition type."); out: put_dev_sector (sect); return result; } /** * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id * @ldb: Cache of the database structures * * The LDM Database contains a list of all partitions on all dynamic disks. * The primary PRIVHEAD, at the beginning of the physical disk, tells us * the GUID of this disk. This function searches for the GUID in a linked * list of vblk's. * * Return: Pointer, A matching vblk was found * NULL, No match, or an error */ static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) { struct list_head *item; BUG_ON (!ldb); list_for_each (item, &ldb->v_disk) { struct vblk *v = list_entry (item, struct vblk, list); if (uuid_equal(&v->vblk.disk.disk_id, &ldb->ph.disk_id)) return v; } return NULL; } /** * ldm_create_data_partitions - Create data partitions for this device * @pp: List of the partitions parsed so far * @ldb: Cache of the database structures * * The database contains ALL the partitions for ALL disk groups, so we need to * filter out this specific disk. Using the disk's object id, we can find all * the partitions in the database that belong to this disk. * * Add each partition in our database, to the parsed_partitions structure. * * N.B. This function creates the partitions in the order it finds partition * objects in the linked list. * * Return: 'true' Partition created * 'false' Error, probably a range checking problem */ static bool ldm_create_data_partitions (struct parsed_partitions *pp, const struct ldmdb *ldb) { struct list_head *item; struct vblk *vb; struct vblk *disk; struct vblk_part *part; int part_num = 1; BUG_ON (!pp || !ldb); disk = ldm_get_disk_objid (ldb); if (!disk) { ldm_crit ("Can't find the ID of this disk in the database."); return false; } strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); /* Create the data partitions */ list_for_each (item, &ldb->v_part) { vb = list_entry (item, struct vblk, list); part = &vb->vblk.part; if (part->disk_id != disk->obj_id) continue; put_partition (pp, part_num, ldb->ph.logical_disk_start + part->start, part->size); part_num++; } strlcat(pp->pp_buf, "\n", PAGE_SIZE); return true; } /** * ldm_relative - Calculate the next relative offset * @buffer: Block of data being worked on * @buflen: Size of the block of data * @base: Size of the previous fixed width fields * @offset: Cumulative size of the previous variable-width fields * * Because many of the VBLK fields are variable-width, it's necessary * to calculate each offset based on the previous one and the length * of the field it pointed to. * * Return: -1 Error, the calculated offset exceeded the size of the buffer * n OK, a range-checked offset into buffer */ static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) { base += offset; if (!buffer || offset < 0 || base > buflen) { if (!buffer) ldm_error("!buffer"); if (offset < 0) ldm_error("offset (%d) < 0", offset); if (base > buflen) ldm_error("base (%d) > buflen (%d)", base, buflen); return -1; } if (base + buffer[base] >= buflen) { ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, buffer[base], buflen); return -1; } return buffer[base] + offset + 1; } /** * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order * @block: Pointer to the variable-width number to convert * * Large numbers in the LDM Database are often stored in a packed format. Each * number is prefixed by a one byte width marker. All numbers in the database * are stored in big-endian byte order. This function reads one of these * numbers and returns the result * * N.B. This function DOES NOT perform any range checking, though the most * it will read is eight bytes. * * Return: n A number * 0 Zero, or an error occurred */ static u64 ldm_get_vnum (const u8 *block) { u64 tmp = 0; u8 length; BUG_ON (!block); length = *block++; if (length && length <= 8) while (length--) tmp = (tmp << 8) | *block++; else ldm_error ("Illegal length %d.", length); return tmp; } /** * ldm_get_vstr - Read a length-prefixed string into a buffer * @block: Pointer to the length marker * @buffer: Location to copy string to * @buflen: Size of the output buffer * * Many of the strings in the LDM Database are not NULL terminated. Instead * they are prefixed by a one byte length marker. This function copies one of * these strings into a buffer. * * N.B. This function DOES NOT perform any range checking on the input. * If the buffer is too small, the output will be truncated. * * Return: 0, Error and @buffer contents are undefined * n, String length in characters (excluding NULL) * buflen-1, String was truncated. */ static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) { int length; BUG_ON (!block || !buffer); length = block[0]; if (length >= buflen) { ldm_error ("Truncating string %d -> %d.", length, buflen); length = buflen - 1; } memcpy (buffer, block + 1, length); buffer[length] = 0; return length; } /** * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Component object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Component VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; struct vblk_comp *comp; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); len = r_cols; } else { r_stripe = 0; len = r_parent; } if (len < 0) return false; len += VBLK_SIZE_CMP3; if (len != get_unaligned_be32(buffer + 0x14)) return false; comp = &vb->vblk.comp; ldm_get_vstr (buffer + 0x18 + r_name, comp->state, sizeof (comp->state)); comp->type = buffer[0x18 + r_vstate]; comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; return true; } /** * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk Group object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Disk Group VBLK * 'false' @vb contents are not defined */ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_diskid, r_id1, r_id2, len; struct vblk_dgrp *dgrp; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; } else len = r_diskid; if (len < 0) return false; len += VBLK_SIZE_DGR3; if (len != get_unaligned_be32(buffer + 0x14)) return false; dgrp = &vb->vblk.dgrp; ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, sizeof (dgrp->disk_id)); return true; } /** * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk Group object (version 4) into a vblk structure. * * Return: 'true' @vb contains a Disk Group VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) { char buf[64]; int r_objid, r_name, r_id1, r_id2, len; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; } else len = r_name; if (len < 0) return false; len += VBLK_SIZE_DGR4; if (len != get_unaligned_be32(buffer + 0x14)) return false; ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); return true; } /** * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Disk VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_diskid, r_altname, len; struct vblk_disk *disk; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); len = r_altname; if (len < 0) return false; len += VBLK_SIZE_DSK3; if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, sizeof (disk->alt_name)); if (uuid_parse(buffer + 0x19 + r_name, &disk->disk_id)) return false; return true; } /** * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Disk object (version 4) into a vblk structure. * * Return: 'true' @vb contains a Disk VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, len; struct vblk_disk *disk; BUG_ON (!buffer || !vb); r_objid = ldm_relative (buffer, buflen, 0x18, 0); r_name = ldm_relative (buffer, buflen, 0x18, r_objid); len = r_name; if (len < 0) return false; len += VBLK_SIZE_DSK4; if (len != get_unaligned_be32(buffer + 0x14)) return false; disk = &vb->vblk.disk; import_uuid(&disk->disk_id, buffer + 0x18 + r_name); return true; } /** * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Partition object (version 3) into a vblk structure. * * Return: 'true' @vb contains a Partition VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; struct vblk_part *part; BUG_ON(!buffer || !vb); r_objid = ldm_relative(buffer, buflen, 0x18, 0); if (r_objid < 0) { ldm_error("r_objid %d < 0", r_objid); return false; } r_name = ldm_relative(buffer, buflen, 0x18, r_objid); if (r_name < 0) { ldm_error("r_name %d < 0", r_name); return false; } r_size = ldm_relative(buffer, buflen, 0x34, r_name); if (r_size < 0) { ldm_error("r_size %d < 0", r_size); return false; } r_parent = ldm_relative(buffer, buflen, 0x34, r_size); if (r_parent < 0) { ldm_error("r_parent %d < 0", r_parent); return false; } r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); if (r_diskid < 0) { ldm_error("r_diskid %d < 0", r_diskid); return false; } if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); if (r_index < 0) { ldm_error("r_index %d < 0", r_index); return false; } len = r_index; } else len = r_diskid; if (len < 0) { ldm_error("len %d < 0", len); return false; } len += VBLK_SIZE_PRT3; if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, get_unaligned_be32(buffer + 0x14)); return false; } part = &vb->vblk.part; part->start = get_unaligned_be64(buffer + 0x24 + r_name); part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); part->size = ldm_get_vnum(buffer + 0x34 + r_name); part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); if (vb->flags & VBLK_FLAG_PART_INDEX) part->partnum = buffer[0x35 + r_diskid]; else part->partnum = 0; return true; } /** * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure * @buffer: Block of data being worked on * @buflen: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK Volume object (version 5) into a vblk structure. * * Return: 'true' @vb contains a Volume VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) { int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; int r_id1, r_id2, r_size2, r_drive, len; struct vblk_volu *volu; BUG_ON(!buffer || !vb); r_objid = ldm_relative(buffer, buflen, 0x18, 0); if (r_objid < 0) { ldm_error("r_objid %d < 0", r_objid); return false; } r_name = ldm_relative(buffer, buflen, 0x18, r_objid); if (r_name < 0) { ldm_error("r_name %d < 0", r_name); return false; } r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); if (r_vtype < 0) { ldm_error("r_vtype %d < 0", r_vtype); return false; } r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); if (r_disable_drive_letter < 0) { ldm_error("r_disable_drive_letter %d < 0", r_disable_drive_letter); return false; } r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); if (r_child < 0) { ldm_error("r_child %d < 0", r_child); return false; } r_size = ldm_relative(buffer, buflen, 0x3D, r_child); if (r_size < 0) { ldm_error("r_size %d < 0", r_size); return false; } if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); if (r_id1 < 0) { ldm_error("r_id1 %d < 0", r_id1); return false; } } else r_id1 = r_size; if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); if (r_id2 < 0) { ldm_error("r_id2 %d < 0", r_id2); return false; } } else r_id2 = r_id1; if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); if (r_size2 < 0) { ldm_error("r_size2 %d < 0", r_size2); return false; } } else r_size2 = r_id2; if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); if (r_drive < 0) { ldm_error("r_drive %d < 0", r_drive); return false; } } else r_drive = r_size2; len = r_drive; if (len < 0) { ldm_error("len %d < 0", len); return false; } len += VBLK_SIZE_VOL5; if (len > get_unaligned_be32(buffer + 0x14)) { ldm_error("len %d > BE32(buffer + 0x14) %d", len, get_unaligned_be32(buffer + 0x14)); return false; } volu = &vb->vblk.volu; ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, sizeof(volu->volume_type)); memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, sizeof(volu->volume_state)); volu->size = ldm_get_vnum(buffer + 0x3D + r_child); volu->partition_type = buffer[0x41 + r_size]; memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, sizeof(volu->drive_hint)); } return true; } /** * ldm_parse_vblk - Read a raw VBLK object into a vblk structure * @buf: Block of data being worked on * @len: Size of the block of data * @vb: In-memory vblk in which to return information * * Read a raw VBLK object into a vblk structure. This function just reads the * information common to all VBLK types, then delegates the rest of the work to * helper functions: ldm_parse_*. * * Return: 'true' @vb contains a VBLK * 'false' @vb contents are not defined */ static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) { bool result = false; int r_objid; BUG_ON (!buf || !vb); r_objid = ldm_relative (buf, len, 0x18, 0); if (r_objid < 0) { ldm_error ("VBLK header is corrupt."); return false; } vb->flags = buf[0x12]; vb->type = buf[0x13]; vb->obj_id = ldm_get_vnum (buf + 0x18); ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); switch (vb->type) { case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; } if (result) ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", (unsigned long long) vb->obj_id, vb->type); else ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", (unsigned long long) vb->obj_id, vb->type); return result; } /** * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database * @data: Raw VBLK to add to the database * @len: Size of the raw VBLK * @ldb: Cache of the database structures * * The VBLKs are sorted into categories. Partitions are also sorted by offset. * * N.B. This function does not check the validity of the VBLKs. * * Return: 'true' The VBLK was added * 'false' An error occurred */ static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) { struct vblk *vb; struct list_head *item; BUG_ON (!data || !ldb); vb = kmalloc (sizeof (*vb), GFP_KERNEL); if (!vb) { ldm_crit ("Out of memory."); return false; } if (!ldm_parse_vblk (data, len, vb)) { kfree(vb); return false; /* Already logged */ } /* Put vblk into the correct list. */ switch (vb->type) { case VBLK_DGR3: case VBLK_DGR4: list_add (&vb->list, &ldb->v_dgrp); break; case VBLK_DSK3: case VBLK_DSK4: list_add (&vb->list, &ldb->v_disk); break; case VBLK_VOL5: list_add (&vb->list, &ldb->v_volu); break; case VBLK_CMP3: list_add (&vb->list, &ldb->v_comp); break; case VBLK_PRT3: /* Sort by the partition's start sector. */ list_for_each (item, &ldb->v_part) { struct vblk *v = list_entry (item, struct vblk, list); if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && (v->vblk.part.start > vb->vblk.part.start)) { list_add_tail (&vb->list, &v->list); return true; } } list_add_tail (&vb->list, &ldb->v_part); break; } return true; } /** * ldm_frag_add - Add a VBLK fragment to a list * @data: Raw fragment to be added to the list * @size: Size of the raw fragment * @frags: Linked list of VBLK fragments * * Fragmented VBLKs may not be consecutive in the database, so they are placed * in a list so they can be pieced together later. * * Return: 'true' Success, the VBLK was added to the list * 'false' Error, a problem occurred */ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) { struct frag *f; struct list_head *item; int rec, num, group; BUG_ON (!data || !frags); if (size < 2 * VBLK_SIZE_HEAD) { ldm_error("Value of size is too small."); return false; } group = get_unaligned_be32(data + 0x08); rec = get_unaligned_be16(data + 0x0C); num = get_unaligned_be16(data + 0x0E); if ((num < 1) || (num > 4)) { ldm_error ("A VBLK claims to have %d parts.", num); return false; } if (rec >= num) { ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); return false; } list_for_each (item, frags) { f = list_entry (item, struct frag, list); if (f->group == group) goto found; } f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); if (!f) { ldm_crit ("Out of memory."); return false; } f->group = group; f->num = num; f->rec = rec; f->map = 0xFF << num; list_add_tail (&f->list, frags); found: if (rec >= f->num) { ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); return false; } if (f->map & (1 << rec)) { ldm_error ("Duplicate VBLK, part %d.", rec); f->map &= 0x7F; /* Mark the group as broken */ return false; } f->map |= (1 << rec); if (!rec) memcpy(f->data, data, VBLK_SIZE_HEAD); data += VBLK_SIZE_HEAD; size -= VBLK_SIZE_HEAD; memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); return true; } /** * ldm_frag_free - Free a linked list of VBLK fragments * @list: Linked list of fragments * * Free a linked list of VBLK fragments * * Return: none */ static void ldm_frag_free (struct list_head *list) { struct list_head *item, *tmp; BUG_ON (!list); list_for_each_safe (item, tmp, list) kfree (list_entry (item, struct frag, list)); } /** * ldm_frag_commit - Validate fragmented VBLKs and add them to the database * @frags: Linked list of VBLK fragments * @ldb: Cache of the database structures * * Now that all the fragmented VBLKs have been collected, they must be added to * the database for later use. * * Return: 'true' All the fragments we added successfully * 'false' One or more of the fragments we invalid */ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) { struct frag *f; struct list_head *item; BUG_ON (!frags || !ldb); list_for_each (item, frags) { f = list_entry (item, struct frag, list); if (f->map != 0xFF) { ldm_error ("VBLK group %d is incomplete (0x%02x).", f->group, f->map); return false; } if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) return false; /* Already logged */ } return true; } /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, * unpacked and validated. We cache them in @ldb according to their type. * * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; Sector sect; bool result = false; LIST_HEAD (frags); BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; } for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ if (MAGIC_VBLK != get_unaligned_be32(data)) { ldm_error ("Expected to find a VBLK."); goto out; } recs = get_unaligned_be16(data + 0x0E); /* Number of records */ if (recs == 1) { if (!ldm_ldmdb_add (data, size, ldb)) goto out; /* Already logged */ } else if (recs > 1) { if (!ldm_frag_add (data, size, &frags)) goto out; /* Already logged */ } /* else Record is not in use, ignore it. */ } put_dev_sector (sect); data = NULL; } result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ out: if (data) put_dev_sector (sect); ldm_frag_free (&frags); return result; } /** * ldm_free_vblks - Free a linked list of vblk's * @lh: Head of a linked list of struct vblk * * Free a list of vblk's and free the memory used to maintain the list. * * Return: none */ static void ldm_free_vblks (struct list_head *lh) { struct list_head *item, *tmp; BUG_ON (!lh); list_for_each_safe (item, tmp, lh) kfree (list_entry (item, struct vblk, list)); } /** * ldm_partition - Find out whether a device is a dynamic disk and handle it * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. * * We create a dummy device 1, which contains the LDM database, and then create * each partition described by the LDM database in sequence as devices 2+. For * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * * Return: 1 Success, @state->disk is a dynamic disk and we handled it * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; BUG_ON(!state); /* Look for signs of a Dynamic Disk */ if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); if (!ldb) { ldm_crit ("Out of memory."); goto out; } /* Parse and check privheads. */ if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). */ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ if (!ldm_validate_tocblocks(state, base, ldb) || !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ INIT_LIST_HEAD (&ldb->v_dgrp); INIT_LIST_HEAD (&ldb->v_disk); INIT_LIST_HEAD (&ldb->v_volu); INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } /* else Already logged */ cleanup: ldm_free_vblks (&ldb->v_dgrp); ldm_free_vblks (&ldb->v_disk); ldm_free_vblks (&ldb->v_volu); ldm_free_vblks (&ldb->v_comp); ldm_free_vblks (&ldb->v_part); out: kfree (ldb); return result; } |
| 3 9 9 109 108 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H #include "btree_cache.h" #include "btree_iter.h" #include "btree_update.h" #include "buckets.h" #include "error.h" #include "super.h" static inline u64 swab40(u64 x) { return (((x & 0x00000000ffULL) << 32)| ((x & 0x000000ff00ULL) << 16)| ((x & 0x0000ff0000ULL) >> 0)| ((x & 0x00ff000000ULL) >> 16)| ((x & 0xff00000000ULL) >> 32)); } int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ .key_validate = bch2_backpointer_validate, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ }) #define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 /* * Convert from pos in backpointer btree to pos of corresponding bucket in alloc * btree: */ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos bp_pos) { u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); rcu_read_unlock(); return ca != NULL; } static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), c, "backpointer for missing device %llu", bp_pos.inode); } static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { return POS(bucket.inode, (bucket_to_sector(ca, bucket.offset) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); } /* * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: */ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) { struct bpos ret = bucket_pos_to_bp_noerror(ca, bucket, bucket_offset); EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(ca, ret))); return ret; } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); struct bkey_i_backpointer bp_k; bkey_backpointer_init(&bp_k.k_i); bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); bp_k.v = bp; if (!insert) { bp_k.k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&bp_k.k, 0); } return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry) { switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: return BCH_DATA_btree; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; case KEY_TYPE_stripe: { const struct bch_extent_ptr *ptr = &entry->ptr; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); BUG_ON(ptr < s.v->ptrs || ptr >= s.v->ptrs + s.v->nr_blocks); return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant ? BCH_DATA_parity : BCH_DATA_user; } default: BUG(); } } static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp, u64 sectors) { u32 bucket_offset; *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); *bp = (struct bch_backpointer) { .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset, .bucket_len = sectors, .pos = k.k->p, }; } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); } int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_backpointer, unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, struct bpos, struct bch_backpointer); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ |
| 898 898 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; #define __compat_uid_t __compat_uid_t typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; #define compat_dev_t compat_dev_t typedef u16 compat_dev_t; #define compat_ipc_pid_t compat_ipc_pid_t typedef u16 compat_ipc_pid_t; #define compat_statfs compat_statfs #include <asm-generic/compat.h> #define COMPAT_UTS_MACHINE "i686\0\0" typedef u16 compat_nlink_t; struct compat_stat { u32 st_dev; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; u32 st_rdev; u32 st_size; u32 st_blksize; u32 st_blocks; u32 st_atime; u32 st_atime_nsec; u32 st_mtime; u32 st_mtime_nsec; u32 st_ctime; u32 st_ctime_nsec; u32 __unused4; u32 __unused5; }; /* * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the * compat flock64 structure. */ #define __ARCH_NEED_COMPAT_FLOCK64_PACKED struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; int f_flags; int f_spare[4]; }; #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; #endif return false; } static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } #ifdef CONFIG_COMPAT static inline bool in_compat_syscall(void) { return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ #define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; #ifdef CONFIG_X86_X32_ABI int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #define copy_siginfo_to_user32 copy_siginfo_to_user32 #endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ |
| 3927 3907 508 1 1 333 133 15 3685 102 3342 746 3926 476 3683 4026 4028 1 3728 4033 4022 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/mount.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <uapi/linux/mount.h> #include "common.h" /* String table for special mount operations. */ static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = { [TOMOYO_MOUNT_BIND] = "--bind", [TOMOYO_MOUNT_MOVE] = "--move", [TOMOYO_MOUNT_REMOUNT] = "--remount", [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable", [TOMOYO_MOUNT_MAKE_PRIVATE] = "--make-private", [TOMOYO_MOUNT_MAKE_SLAVE] = "--make-slave", [TOMOYO_MOUNT_MAKE_SHARED] = "--make-shared", }; /** * tomoyo_audit_mount_log - Audit mount log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mount_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n", r->param.mount.dev->name, r->param.mount.dir->name, r->param.mount.type->name, r->param.mount.flags); } /** * tomoyo_check_mount_acl - Check permission for path path path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mount_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_compare_number_union(r->param.mount.flags, &acl->flags) && tomoyo_compare_name_union(r->param.mount.type, &acl->fs_type) && tomoyo_compare_name_union(r->param.mount.dir, &acl->dir_name) && (!r->param.mount.need_dev || tomoyo_compare_name_union(r->param.mount.dev, &acl->dev_name)); } /** * tomoyo_mount_acl - Check permission for mount() operation. * * @r: Pointer to "struct tomoyo_request_info". * @dev_name: Name of device file. Maybe NULL. * @dir: Pointer to "struct path". * @type: Name of filesystem type. * @flags: Mount options. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_mount_acl(struct tomoyo_request_info *r, const char *dev_name, const struct path *dir, const char *type, unsigned long flags) { struct tomoyo_obj_info obj = { }; struct path path; struct file_system_type *fstype = NULL; const char *requested_type = NULL; const char *requested_dir_name = NULL; const char *requested_dev_name = NULL; struct tomoyo_path_info rtype; struct tomoyo_path_info rdev; struct tomoyo_path_info rdir; int need_dev = 0; int error = -ENOMEM; r->obj = &obj; /* Get fstype. */ requested_type = tomoyo_encode(type); if (!requested_type) goto out; rtype.name = requested_type; tomoyo_fill_path_info(&rtype); /* Get mount point. */ obj.path2 = *dir; requested_dir_name = tomoyo_realpath_from_path(dir); if (!requested_dir_name) { error = -ENOMEM; goto out; } rdir.name = requested_dir_name; tomoyo_fill_path_info(&rdir); /* Compare fs name. */ if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] || type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) { /* dev_name is ignored. */ } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] || type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) { need_dev = -1; /* dev_name is a directory */ } else { fstype = get_fs_type(type); if (!fstype) { error = -ENODEV; goto out; } if (fstype->fs_flags & FS_REQUIRES_DEV) /* dev_name is a block device file. */ need_dev = 1; } if (need_dev) { /* Get mount point or device file. */ if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) { error = -ENOENT; goto out; } obj.path1 = path; requested_dev_name = tomoyo_realpath_from_path(&path); if (!requested_dev_name) { error = -ENOENT; goto out; } } else { /* Map dev_name to "<NULL>" if no dev_name given. */ if (!dev_name) dev_name = "<NULL>"; requested_dev_name = tomoyo_encode(dev_name); if (!requested_dev_name) { error = -ENOMEM; goto out; } } rdev.name = requested_dev_name; tomoyo_fill_path_info(&rdev); r->param_type = TOMOYO_TYPE_MOUNT_ACL; r->param.mount.need_dev = need_dev; r->param.mount.dev = &rdev; r->param.mount.dir = &rdir; r->param.mount.type = &rtype; r->param.mount.flags = flags; do { tomoyo_check_acl(r, tomoyo_check_mount_acl); error = tomoyo_audit_mount_log(r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(requested_dev_name); kfree(requested_dir_name); if (fstype) put_filesystem(fstype); kfree(requested_type); /* Drop refcount obtained by kern_path(). */ if (obj.path1.dentry) path_put(&obj.path1); return error; } /** * tomoyo_mount_permission - Check permission for mount() operation. * * @dev_name: Name of device file. Maybe NULL. * @path: Pointer to "struct path". * @type: Name of filesystem type. Maybe NULL. * @flags: Mount options. * @data_page: Optional data. Maybe NULL. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mount_permission(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data_page) { struct tomoyo_request_info r; int error; int idx; if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT) == TOMOYO_CONFIG_DISABLED) return 0; if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; if (flags & MS_REMOUNT) { type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]; flags &= ~MS_REMOUNT; } else if (flags & MS_BIND) { type = tomoyo_mounts[TOMOYO_MOUNT_BIND]; flags &= ~MS_BIND; } else if (flags & MS_SHARED) { if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]; flags &= ~MS_SHARED; } else if (flags & MS_PRIVATE) { if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE]; flags &= ~MS_PRIVATE; } else if (flags & MS_SLAVE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE]; flags &= ~MS_SLAVE; } else if (flags & MS_UNBINDABLE) { if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE)) return -EINVAL; type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE]; flags &= ~MS_UNBINDABLE; } else if (flags & MS_MOVE) { type = tomoyo_mounts[TOMOYO_MOUNT_MOVE]; flags &= ~MS_MOVE; } if (!type) type = "<NULL>"; idx = tomoyo_read_lock(); error = tomoyo_mount_acl(&r, dev_name, path, type, flags); tomoyo_read_unlock(idx); return error; } |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables); }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL"); |
| 2094 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 KFENCE support. * * Copyright (C) 2020, Google LLC. */ #ifndef _ASM_X86_KFENCE_H #define _ASM_X86_KFENCE_H #ifndef MODULE #include <linux/bug.h> #include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/tlbflush.h> /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { unsigned long addr; for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); addr += PAGE_SIZE) { unsigned int level; if (!lookup_address(addr, &level)) return false; if (level != PG_LEVEL_4K) set_memory_4k(addr, 1); } return true; } /* Protect the given page and flush TLB. */ static inline bool kfence_protect_page(unsigned long addr, bool protect) { unsigned int level; pte_t *pte = lookup_address(addr, &level); if (WARN_ON(!pte || level != PG_LEVEL_4K)) return false; /* * We need to avoid IPIs, as we may get KFENCE allocations or faults * with interrupts disabled. Therefore, the below is best-effort, and * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; * lazy fault handling takes care of faults after the page is PRESENT. */ if (protect) set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); /* * Flush this CPU's TLB, assuming whoever did the allocation/free is * likely to continue running on this CPU. */ preempt_disable(); flush_tlb_one_kernel(addr); preempt_enable(); return true; } #endif /* !MODULE */ #endif /* _ASM_X86_KFENCE_H */ |
| 1 3 1 15 15 11 7 11 1 11 11 1 10 10 6 11 11 1 11 11 11 5 1 1 4 6 6 6 6 6 2 6 4 6 2 6 10 6 6 6 6 6 4 1 1 1 1 1 9 9 2 9 8 1 9 9 9 9 9 9 9 9 9 9 9 9 9 8 9 9 9 9 9 5 9 9 9 9 9 5 5 5 5 5 9 2 8 5 4 5 5 5 5 5 4 4 4 4 4 4 4 4 4 3 3 2 2 1 1 2 2 1 2 2 2 2 5 5 2 3 1 3 5 5 5 4 1 10 2 3 1 6 1 2 2 2 3 6 7 6 7 6 6 4 3 4 1 2 2 5 5 5 5 1 1 1 1 1 1 1 11 2 1 1 8 8 6 8 8 1 6 3 3 1 1 1 1 1 1 1 1 1 2 2 2 1 7 2 1 1 1 1 1 1 1 1 1 2 7 7 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 4 1 1 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 | // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> #include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> #include <linux/minmax.h> #include <linux/overflow.h> #include <linux/buildid.h> #include <asm/elf.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "internal.h" #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8) void task_mem(struct seq_file *m, struct mm_struct *mm) { unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES); shmem = get_mm_counter(mm, MM_SHMEMPAGES); /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any * collector of these hiwater stats must therefore get total_vm * and rss too, which will usually be the higher. Barriers? not * worth the effort, such snapshots can always be inconsistent. */ hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; /* split executable areas between text and lib */ text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); text = min(text, mm->exec_vm << PAGE_SHIFT); lib = (mm->exec_vm << PAGE_SHIFT) - text; swap = get_mm_counter(mm, MM_SWAPENTS); SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); SEQ_PUT_DEC(" kB\nRssFile:\t", file); SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem); SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm); SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm); seq_put_decimal_ull_width(m, " kB\nVmExe:\t", text >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmLib:\t", lib >> 10, 8); seq_put_decimal_ull_width(m, " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8); SEQ_PUT_DEC(" kB\nVmSwap:\t", swap); seq_puts(m, " kB\n"); hugetlb_report_usage(m, mm); } #undef SEQ_PUT_DEC unsigned long task_vsize(struct mm_struct *mm) { return PAGE_SIZE * mm->total_vm; } unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { *shared = get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } #ifdef CONFIG_NUMA /* * Save get_task_policy() for show_numa_map(). */ static void hold_task_mempolicy(struct proc_maps_private *priv) { struct task_struct *task = priv->task; task_lock(task); priv->task_mempolicy = get_task_policy(task); mpol_get(priv->task_mempolicy); task_unlock(task); } static void release_task_mempolicy(struct proc_maps_private *priv) { mpol_put(priv->task_mempolicy); } #else static void hold_task_mempolicy(struct proc_maps_private *priv) { } static void release_task_mempolicy(struct proc_maps_private *priv) { } #endif static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, loff_t *ppos) { struct vm_area_struct *vma = vma_next(&priv->iter); if (vma) { *ppos = vma->vm_start; } else { *ppos = -2UL; vma = get_gate_vma(priv->mm); } return vma; } static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; return NULL; } if (mmap_read_lock_killable(mm)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; return ERR_PTR(-EINTR); } vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); if (last_addr == -2UL) return get_gate_vma(mm); return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { if (*ppos == -2UL) { *ppos = -1UL; return NULL; } return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mm_struct *mm = priv->mm; if (!priv->task) return; release_task_mempolicy(priv); mmap_read_unlock(mm); mmput(mm); put_task_struct(priv->task); priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops, int psize) { struct proc_maps_private *priv = __seq_open_private(file, ops, psize); if (!priv) return -ENOMEM; priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(priv->mm)) { int err = PTR_ERR(priv->mm); seq_release_private(inode, file); return err; } return 0; } static int proc_map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->mm) mmdrop(priv->mm); return seq_release_private(inode, file); } static int do_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { return proc_maps_open(inode, file, ops, sizeof(struct proc_maps_private)); } static void get_vma_name(struct vm_area_struct *vma, const struct path **path, const char **name, const char **name_fmt) { struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL; *name = NULL; *path = NULL; *name_fmt = NULL; /* * Print the dentry name for named mappings, and a * special [heap] marker for the heap: */ if (vma->vm_file) { /* * If user named this anon shared memory via * prctl(PR_SET_VMA ..., use the provided name. */ if (anon_name) { *name_fmt = "[anon_shmem:%s]"; *name = anon_name->name; } else { *path = file_user_path(vma->vm_file); } return; } if (vma->vm_ops && vma->vm_ops->name) { *name = vma->vm_ops->name(vma); if (*name) return; } *name = arch_vma_name(vma); if (*name) return; if (!vma->vm_mm) { *name = "[vdso]"; return; } if (vma_is_initial_heap(vma)) { *name = "[heap]"; return; } if (vma_is_initial_stack(vma)) { *name = "[stack]"; return; } if (anon_name) { *name_fmt = "[anon:%s]"; *name = anon_name->name; return; } } static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, dev_t dev, unsigned long ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_put_hex_ll(m, NULL, start, 8); seq_put_hex_ll(m, "-", end, 8); seq_putc(m, ' '); seq_putc(m, flags & VM_READ ? 'r' : '-'); seq_putc(m, flags & VM_WRITE ? 'w' : '-'); seq_putc(m, flags & VM_EXEC ? 'x' : '-'); seq_putc(m, flags & VM_MAYSHARE ? 's' : 'p'); seq_put_hex_ll(m, " ", pgoff, 8); seq_put_hex_ll(m, " ", MAJOR(dev), 2); seq_put_hex_ll(m, ":", MINOR(dev), 2); seq_put_decimal_ull(m, " ", ino); seq_putc(m, ' '); } static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { const struct path *path; const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); get_vma_name(vma, &path, &name, &name_fmt); if (path) { seq_pad(m, ' '); seq_path(m, path, "\n"); } else if (name_fmt) { seq_pad(m, ' '); seq_printf(m, name_fmt, name); } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } seq_putc(m, '\n'); } static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); return 0; } static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; static int pid_maps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_maps_op); } #define PROCMAP_QUERY_VMA_FLAGS ( \ PROCMAP_QUERY_VMA_READABLE | \ PROCMAP_QUERY_VMA_WRITABLE | \ PROCMAP_QUERY_VMA_EXECUTABLE | \ PROCMAP_QUERY_VMA_SHARED \ ) #define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ PROCMAP_QUERY_FILE_BACKED_VMA | \ PROCMAP_QUERY_VMA_FLAGS \ ) static int query_vma_setup(struct mm_struct *mm) { return mmap_read_lock_killable(mm); } static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) { mmap_read_unlock(mm); } static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) { return find_vma(mm, addr); } static struct vm_area_struct *query_matching_vma(struct mm_struct *mm, unsigned long addr, u32 flags) { struct vm_area_struct *vma; next_vma: vma = query_vma_find_by_addr(mm, addr); if (!vma) goto no_vma; /* user requested only file-backed VMA, keep iterating */ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) goto skip_vma; /* VMA permissions should satisfy query flags */ if (flags & PROCMAP_QUERY_VMA_FLAGS) { u32 perm = 0; if (flags & PROCMAP_QUERY_VMA_READABLE) perm |= VM_READ; if (flags & PROCMAP_QUERY_VMA_WRITABLE) perm |= VM_WRITE; if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) perm |= VM_EXEC; if (flags & PROCMAP_QUERY_VMA_SHARED) perm |= VM_MAYSHARE; if ((vma->vm_flags & perm) != perm) goto skip_vma; } /* found covering VMA or user is OK with the matching next VMA */ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) return vma; skip_vma: /* * If the user needs closest matching VMA, keep iterating. */ addr = vma->vm_end; if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) goto next_vma; no_vma: return ERR_PTR(-ENOENT); } static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) { struct procmap_query karg; struct vm_area_struct *vma; struct mm_struct *mm; const char *name = NULL; char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; __u64 usize; int err; if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) return -EFAULT; /* argument struct can never be that large, reject abuse */ if (usize > PAGE_SIZE) return -E2BIG; /* argument struct should have at least query_flags and query_addr fields */ if (usize < offsetofend(struct procmap_query, query_addr)) return -EINVAL; err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); if (err) return err; /* reject unknown flags */ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) return -EINVAL; /* either both buffer address and size are set, or both should be zero */ if (!!karg.vma_name_size != !!karg.vma_name_addr) return -EINVAL; if (!!karg.build_id_size != !!karg.build_id_addr) return -EINVAL; mm = priv->mm; if (!mm || !mmget_not_zero(mm)) return -ESRCH; err = query_vma_setup(mm); if (err) { mmput(mm); return err; } vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); if (IS_ERR(vma)) { err = PTR_ERR(vma); vma = NULL; goto out; } karg.vma_start = vma->vm_start; karg.vma_end = vma->vm_end; karg.vma_flags = 0; if (vma->vm_flags & VM_READ) karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; if (vma->vm_flags & VM_WRITE) karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; if (vma->vm_flags & VM_EXEC) karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; if (vma->vm_flags & VM_MAYSHARE) karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; karg.vma_page_size = vma_kernel_pagesize(vma); if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; karg.dev_major = MAJOR(inode->i_sb->s_dev); karg.dev_minor = MINOR(inode->i_sb->s_dev); karg.inode = inode->i_ino; } else { karg.vma_offset = 0; karg.dev_major = 0; karg.dev_minor = 0; karg.inode = 0; } if (karg.build_id_size) { __u32 build_id_sz; err = build_id_parse(vma, build_id_buf, &build_id_sz); if (err) { karg.build_id_size = 0; } else { if (karg.build_id_size < build_id_sz) { err = -ENAMETOOLONG; goto out; } karg.build_id_size = build_id_sz; } } if (karg.vma_name_size) { size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); const struct path *path; const char *name_fmt; size_t name_sz = 0; get_vma_name(vma, &path, &name, &name_fmt); if (path || name_fmt || name) { name_buf = kmalloc(name_buf_sz, GFP_KERNEL); if (!name_buf) { err = -ENOMEM; goto out; } } if (path) { name = d_path(path, name_buf, name_buf_sz); if (IS_ERR(name)) { err = PTR_ERR(name); goto out; } name_sz = name_buf + name_buf_sz - name; } else if (name || name_fmt) { name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); name = name_buf; } if (name_sz > name_buf_sz) { err = -ENAMETOOLONG; goto out; } karg.vma_name_size = name_sz; } /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ query_vma_teardown(mm, vma); mmput(mm); if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), name, karg.vma_name_size)) { kfree(name_buf); return -EFAULT; } kfree(name_buf); if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), build_id_buf, karg.build_id_size)) return -EFAULT; if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) return -EFAULT; return 0; out: query_vma_teardown(mm, vma); mmput(mm); kfree(name_buf); return err; } static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; switch (cmd) { case PROCMAP_QUERY: return do_procmap_query(priv, (void __user *)arg); default: return -ENOIOCTLCMD; } } const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, .unlocked_ioctl = procfs_procmap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * Proportional Set Size(PSS): my share of RSS. * * PSS of a process is the count of pages it has in memory, where each * page is divided by the number of processes sharing it. So if a * process has 1000 pages all to itself, and 1000 shared with one other * process, its PSS will be 1500. * * To keep (accumulated) division errors low, we adopt a 64bit * fixed-point pss counter to minimize division errors. So (pss >> * PSS_SHIFT) would be the real byte count. * * A shift of 12 before division means (assuming 4K page size): * - 1M 3-user-pages add up to 8KB errors; * - supports mapcount up to 2^24, or 16M; * - supports PSS up to 2^52 bytes, or 4PB. */ #define PSS_SHIFT 12 #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; u64 pss_shmem; u64 pss_dirty; u64 pss_locked; u64 swap_pss; }; static void smaps_page_accumulate(struct mem_size_stats *mss, struct folio *folio, unsigned long size, unsigned long pss, bool dirty, bool locked, bool private) { mss->pss += pss; if (folio_test_anon(folio)) mss->pss_anon += pss; else if (folio_test_swapbacked(folio)) mss->pss_shmem += pss; else mss->pss_file += pss; if (locked) mss->pss_locked += pss; if (dirty || folio_test_dirty(folio)) { mss->pss_dirty += pss; if (private) mss->private_dirty += size; else mss->shared_dirty += size; } else { if (private) mss->private_clean += size; else mss->shared_clean += size; } } static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, bool present) { struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; /* * First accumulate quantities that depend only on |size| and the type * of the compound page. */ if (folio_test_anon(folio)) { mss->anonymous += size; if (!folio_test_swapbacked(folio) && !dirty && !folio_test_dirty(folio)) mss->lazyfree += size; } if (folio_test_ksm(folio)) mss->ksm += size; mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || folio_test_young(folio) || folio_test_referenced(folio)) mss->referenced += size; /* * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * * refcount == 1 for present entries guarantees that the folio is mapped * exactly once. For large folios this implies that exactly one * PTE/PMD/... maps (a part of) this folio. * * Treat all non-present entries (where relying on the mapcount and * refcount doesn't make sense) as "maybe shared, but not sure how * often". We treat device private entries as being fake-present. * * Note that it would not be safe to read the mapcount especially for * pages referenced by migration entries, even with the PTL held. */ if (folio_ref_count(folio) == 1 || !present) { smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, dirty, locked, present); return; } /* * We obtain a snapshot of the mapcount. Without holding the folio lock * this snapshot can be slightly wrong as we cannot always read the * mapcount atomically. */ for (i = 0; i < nr; i++, page++) { int mapcount = folio_precise_page_mapcount(folio, page); unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (mapcount >= 2) pss /= mapcount; smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, dirty, locked, mapcount < 2); } } #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, linear_page_index(vma, addr), linear_page_index(vma, end)); return 0; } #else #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) { #ifdef CONFIG_SHMEM if (walk->ops->pte_hole) { /* depth is not used */ smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); } #endif } static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false, young = false, dirty = false; pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { page = vm_normal_page(vma, addr, ptent); young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; mss->swap += PAGE_SIZE; mapcount = swp_swapcount(swpent); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; do_div(pss_delta, mapcount); mss->swap_pss += pss_delta; } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } } else if (is_pfn_swap_entry(swpent)) { if (is_device_private_entry(swpent)) present = true; page = pfn_swap_entry_to_page(swpent); } } else { smaps_pte_hole_lookup(addr, walk); return; } if (!page) return; smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool present = false; struct folio *folio; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; folio = page_folio(page); if (folio_test_anon(folio)) mss->anonymous_thp += HPAGE_PMD_SIZE; else if (folio_test_swapbacked(folio)) mss->shmem_thp += HPAGE_PMD_SIZE; else if (folio_is_zone_device(folio)) /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { } #endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); out: cond_resched(); return 0; } static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) { /* * Don't forget to update Documentation/ on changes. * * The length of the second argument of mnemonics[] * needs to be 3 instead of previously set 2 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3]) * to avoid spurious * -Werror=unterminated-string-initialization warning * with GCC 15 */ static const char mnemonics[BITS_PER_LONG][3] = { /* * In case if we meet a flag we don't know about. */ [0 ... (BITS_PER_LONG-1)] = "??", [ilog2(VM_READ)] = "rd", [ilog2(VM_WRITE)] = "wr", [ilog2(VM_EXEC)] = "ex", [ilog2(VM_SHARED)] = "sh", [ilog2(VM_MAYREAD)] = "mr", [ilog2(VM_MAYWRITE)] = "mw", [ilog2(VM_MAYEXEC)] = "me", [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_ARM64_BTI [ilog2(VM_ARM64_BTI)] = "bt", #endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif [ilog2(VM_MIXEDMAP)] = "mm", [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", #ifdef CONFIG_ARM64_MTE [ilog2(VM_MTE)] = "mt", [ilog2(VM_MTE_ALLOWED)] = "", #endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", #if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", #endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) [ilog2(VM_DROPPABLE)] = "dp", #endif #ifdef CONFIG_64BIT [ilog2(VM_SEALED)] = "sl", #endif }; size_t i; seq_puts(m, "VmFlags: "); for (i = 0; i < BITS_PER_LONG; i++) { if (!mnemonics[i][0]) continue; if (vma->vm_flags & (1UL << i)) seq_printf(m, "%s ", mnemonics[i]); } seq_putc(m, '\n'); } #ifdef CONFIG_HUGETLB_PAGE static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; pte_t ptent = huge_ptep_get(walk->mm, addr, pte); struct folio *folio = NULL; bool present = false; if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) folio = pfn_swap_entry_folio(swpent); } if (folio) { /* We treat non-present entries as "maybe shared". */ if (!present || folio_likely_mapped_shared(folio) || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); } return 0; } #else #define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, .walk_lock = PGWALK_RDLOCK, }; /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. * * Use vm_start of @vma as the beginning address if @start is 0. */ static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss, unsigned long start) { const struct mm_walk_ops *ops = &smaps_walk_ops; /* Invalid start */ if (start >= vma->vm_end) return; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all * swapped out pages belong to the shmem object, and we can * obtain the swap value much more efficiently. For private * writable mappings, we might have COW pages that are * not affected by the parent swapped out pages of the shmem * object, so we have to distinguish them during the page walk. * Unless we know that the shmem object (or the part mapped by * our VMA) has no swapped out pages at all. */ unsigned long shmem_swapped = shmem_swap_usage(vma); if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { ops = &smaps_shmem_walk_ops; } } /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); else walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); } #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) /* Show the contents common for smaps and smaps_rollup */ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, bool rollup_mode) { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); if (rollup_mode) { /* * These are meaningful only for smaps_rollup, otherwise two of * them are zero, and the other one is the same as Pss. */ SEQ_PUT_DEC(" kB\nPss_Anon: ", mss->pss_anon >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_File: ", mss->pss_file >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nPss_Shmem: ", mss->pss_shmem >> PSS_SHIFT); } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); SEQ_PUT_DEC(" kB\nSwapPss: ", mss->swap_pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nLocked: ", mss->pss_locked >> PSS_SHIFT); seq_puts(m, " kB\n"); } static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); show_map_vma(m, vma); SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); seq_puts(m, " kB\n"); __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); return 0; } static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss = {}; struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; } ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; hold_task_mempolicy(priv); vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; vma_start = vma->vm_start; do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; /* * Release mmap_lock temporarily if someone wants to * access it for write request. */ if (mmap_lock_is_contended(mm)) { vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { release_task_mempolicy(priv); goto out_put_mm; } /* * After dropping the lock, there are four cases to * consider. See the following example for explanation. * * +------+------+-----------+ * | VMA1 | VMA2 | VMA3 | * +------+------+-----------+ * | | | | * 4k 8k 16k 400k * * Suppose we drop the lock after reading VMA2 due to * contention, then we get: * * last_vma_end = 16k * * 1) VMA2 is freed, but VMA3 exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; continue; } /* Case 4 above */ if (vma->vm_end > last_vma_end) { smap_gather_stats(vma, &mss, last_vma_end); last_vma_end = vma->vm_end; } } } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); __show_smap(m, &mss, true); release_task_mempolicy(priv); mmap_read_unlock(mm); out_put_mm: mmput(mm); out_put_task: put_task_struct(priv->task); priv->task = NULL; return ret; } #undef SEQ_PUT_DEC static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_smaps_op); } static int smaps_rollup_open(struct inode *inode, struct file *file) { int ret; struct proc_maps_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); if (!priv) return -ENOMEM; ret = single_open(file, show_smaps_rollup, priv); if (ret) goto out_free; priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(priv->mm)) { ret = PTR_ERR(priv->mm); single_release(inode, file); goto out_free; } return 0; out_free: kfree(priv); return ret; } static int smaps_rollup_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; if (priv->mm) mmdrop(priv->mm); kfree(priv); return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; const struct file_operations proc_pid_smaps_rollup_operations = { .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, .release = smaps_rollup_release, }; enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; struct clear_refs_private { enum clear_refs_types type; }; #ifdef CONFIG_MEM_SOFT_DIRTY static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; if (!pte_write(pte)) return false; if (!is_cow_mapping(vma->vm_flags)) return false; if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) return false; return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) return; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } #else static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { } #endif static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; } if (!pmd_present(*pmd)) goto out; folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. */ pmdp_test_and_clear_young(vma, addr, pmd); folio_test_clear_young(folio); folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); folio_test_clear_young(folio); folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; } static int clear_refs_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; if (vma->vm_flags & VM_PFNMAP) return 1; /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (cp->type == CLEAR_REFS_ANON && vma->vm_file) return 1; if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) return 1; return 0; } static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } if (type == CLEAR_REFS_MM_HIWATER_RSS) { /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); goto out_unlock; } if (type == CLEAR_REFS_SOFT_DIRTY) { for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); } out_unlock: mmap_write_unlock(mm); out_mm: mmput(mm); } put_task_struct(task); return count; } const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, .llseek = noop_llseek, }; typedef struct { u64 pme; } pagemap_entry_t; struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; int err = 0; while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as non-present. */ unsigned long hole_end; if (vma) hole_end = min(end, vma->vm_start); else hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } if (!vma) break; /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } } out: return err; } static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame = 0, flags = 0; struct page *page = NULL; struct folio *folio; if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); if (pm->show_pfn) { pgoff_t offset; /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. */ if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); if (pte_marker_entry_uffd_wp(entry)) flags |= PM_UFFD_WP; } if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; if ((flags & PM_PRESENT) && folio_precise_page_mapcount(folio, page) == 1) flags |= PM_MMAP_EXCLUSIVE; } if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; struct page *page = NULL; struct folio *folio = NULL; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; if (pmd_present(pmd)) { page = pmd_page(pmd); flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned long offset; if (pm->show_pfn) { if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); page = pfn_swap_entry_to_page(entry); } #endif if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; } for (; addr != end; addr += PAGE_SIZE, idx++) { unsigned long cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && folio_precise_page_mapcount(folio, page + idx) == 1) cur_flags |= PM_MMAP_EXCLUSIVE; pme = make_pme(frame, cur_flags); err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { if (flags & PM_PRESENT) frame++; else if (flags & PM_SWAP) frame += (1 << MAX_SWAPFILES_SHIFT); } } spin_unlock(ptl); return err; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(&pme, pm); if (err) break; } pte_unmap_unlock(orig_pte, ptl); cond_resched(); return err; } #ifdef CONFIG_HUGETLB_PAGE /* This function walks within one hugetlb entry in the single call */ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); if (!folio_test_anon(folio)) flags |= PM_FILE; if (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) flags |= PM_UFFD_WP; flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); } else if (pte_swp_uffd_wp_any(pte)) { flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } cond_resched(); return err; } #else #define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present * * If the page is not present but in swap, then the PFN contains an * encoding of the swap file number and the page's offset into the * swap. Unmapped pages return a null PFN. This allows determining * precisely which pages are mapped (or in swap) and comparing mapped * pages between processes. * * Efficient users of this interface will use /proc/pid/maps to * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; struct pagemapread pm; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; if (!mm || !mmget_not_zero(mm)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_mm; ret = 0; if (!count) goto out_mm; /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; end_vaddr = mm->task_size; /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); if (end >= start_vaddr && end < mm->task_size) end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; unsigned long end; pm.pos = 0; end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; } copied += len; buf += len; count -= len; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) ret = copied; out_free: kfree(pm.buffer); out_mm: mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); file->private_data = mm; return 0; } static int pagemap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { struct pm_scan_arg arg; unsigned long masks_of_interest, cur_vma_category; struct page_region *vec_buf; unsigned long vec_buf_len, vec_buf_index, found_pages; struct page_region __user *vec_out; }; static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long categories = 0; if (pte_present(pte)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page(vma, addr, pte); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t ptent) { if (pte_present(ptent)) { pte_t old_pte; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_mkuffd_wp(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } else { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); } } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long categories = PAGE_IS_HUGE; if (pmd_present(pmd)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pmd_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page_pmd(vma, addr, pmd); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (pmd_swp_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } return categories; } static void make_uffd_wp_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HUGETLB_PAGE static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for * swapped pages. */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { unsigned long psize; if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) return; psize = huge_page_size(hstate_vma(vma)); if (is_hugetlb_entry_migration(ptent)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); else if (!huge_pte_none(ptent)) huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); else set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } #endif /* CONFIG_HUGETLB_PAGE */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) static void pagemap_scan_backout_range(struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; if (cur_buf->start != addr) cur_buf->end = addr; else cur_buf->start = cur_buf->end = 0; p->found_pages -= (end - addr) / PAGE_SIZE; } #endif static bool pagemap_scan_is_interesting_page(unsigned long categories, const struct pagemap_scan_private *p) { categories ^= p->arg.category_inverted; if ((categories & p->arg.category_mask) != p->arg.category_mask) return false; if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) return false; return true; } static bool pagemap_scan_is_interesting_vma(unsigned long categories, const struct pagemap_scan_private *p) { unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; categories ^= p->arg.category_inverted; if ((categories & required) != required) return false; return true; } static int pagemap_scan_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long vma_category = 0; bool wp_allowed = userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma); if (!wp_allowed) { /* User requested explicit failure over wp-async capability */ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) return -EPERM; /* * User requires wr-protect, and allows silently skipping * unsupported vmas. */ if (p->arg.flags & PM_SCAN_WP_MATCHING) return 1; /* * Then the request doesn't involve wr-protects at all, * fall through to the rest checks, and allow vma walk. */ } if (vma->vm_flags & VM_PFNMAP) return 1; if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; if (vma->vm_flags & VM_SOFTDIRTY) vma_category |= PAGE_IS_SOFT_DIRTY; if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; p->cur_vma_category = vma_category; return 0; } static bool pagemap_scan_push_range(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; /* * When there is no output buffer provided at all, the sentinel values * won't match here. There is no other way for `cur_buf->end` to be * non-zero other than it being non-empty. */ if (addr == cur_buf->end && categories == cur_buf->categories) { cur_buf->end = end; return true; } if (cur_buf->end) { if (p->vec_buf_index >= p->vec_buf_len - 1) return false; cur_buf = &p->vec_buf[++p->vec_buf_index]; } cur_buf->start = addr; cur_buf->end = end; cur_buf->categories = categories; return true; } static int pagemap_scan_output(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long *end) { unsigned long n_pages, total_pages; int ret = 0; if (!p->vec_buf) return 0; categories &= p->arg.return_mask; n_pages = (*end - addr) / PAGE_SIZE; if (check_add_overflow(p->found_pages, n_pages, &total_pages) || total_pages > p->arg.max_pages) { size_t n_too_much = total_pages - p->arg.max_pages; *end -= n_too_much * PAGE_SIZE; n_pages -= n_too_much; ret = -ENOSPC; } if (!pagemap_scan_push_range(categories, p, addr, *end)) { *end = addr; n_pages = 0; ret = -ENOSPC; } p->found_pages += n_pages; if (ret) p->arg.walk_end = *end; return ret; } static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return -ENOENT; categories = p->cur_vma_category | pagemap_thp_category(p, vma, start, *pmd); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~p->arg.flags & PM_SCAN_WP_MATCHING) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; /* * Break huge page into small pages if the WP operation * needs to be performed on a portion of the huge page. */ if (end != start + HPAGE_SIZE) { spin_unlock(ptl); split_huge_pmd(vma, pmd, start); pagemap_scan_backout_range(p, start, end); /* Report as if there was no THP */ return -ENOENT; } make_uffd_wp_pmd(vma, start, pmd); flush_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); return ret; #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ return -ENOENT; #endif } static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long addr, flush_end = 0; pte_t *pte, *start_pte; spinlock_t *ptl; int ret; arch_enter_lazy_mmu_mode(); ret = pagemap_scan_thp_entry(pmd, start, end, walk); if (ret != -ENOENT) { arch_leave_lazy_mmu_mode(); return ret; } ret = 0; start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); if (!pte) { arch_leave_lazy_mmu_mode(); walk->action = ACTION_AGAIN; return 0; } if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = addr + PAGE_SIZE; } goto flush_and_return; } if (!p->arg.category_anyof_mask && !p->arg.category_inverted && p->arg.category_mask == PAGE_IS_WRITTEN && p->arg.return_mask == PAGE_IS_WRITTEN) { for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { unsigned long next = addr + PAGE_SIZE; pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } goto flush_and_return; } for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptent); unsigned long next = addr + PAGE_SIZE; if (!pagemap_scan_is_interesting_page(categories, p)) continue; ret = pagemap_scan_output(categories, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; if (~categories & PAGE_IS_WRITTEN) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); pte_unmap_unlock(start_pte, ptl); arch_leave_lazy_mmu_mode(); cond_resched(); return ret; } #ifdef CONFIG_HUGETLB_PAGE static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; pte_t pte; if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) return 0; return pagemap_scan_output(categories, p, start, &end); } i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; if (end != start + HPAGE_SIZE) { /* Partial HugeTLB page WP isn't possible. */ pagemap_scan_backout_range(p, start, end); p->arg.walk_end = start; ret = 0; goto out_unlock; } make_uffd_wp_huge_pte(vma, start, ptep, pte); flush_hugetlb_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); i_mmap_unlock_write(vma->vm_file->f_mapping); return ret; } #else #define pagemap_scan_hugetlb_entry NULL #endif static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; int ret, err; if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) return 0; ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); if (addr == end) return ret; if (~p->arg.flags & PM_SCAN_WP_MATCHING) return ret; err = uffd_wp_range(vma, addr, end - addr, true); if (err < 0) ret = err; return ret; } static const struct mm_walk_ops pagemap_scan_ops = { .test_walk = pagemap_scan_test_walk, .pmd_entry = pagemap_scan_pmd_entry, .pte_hole = pagemap_scan_pte_hole, .hugetlb_entry = pagemap_scan_hugetlb_entry, }; static int pagemap_scan_get_args(struct pm_scan_arg *arg, unsigned long uarg) { if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) return -EFAULT; if (arg->size != sizeof(struct pm_scan_arg)) return -EINVAL; /* Validate requested features */ if (arg->flags & ~PM_SCAN_FLAGS) return -EINVAL; if ((arg->category_inverted | arg->category_mask | arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) return -EINVAL; arg->start = untagged_addr((unsigned long)arg->start); arg->end = untagged_addr((unsigned long)arg->end); arg->vec = untagged_addr((unsigned long)arg->vec); /* Validate memory pointers */ if (!IS_ALIGNED(arg->start, PAGE_SIZE)) return -EINVAL; if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ arg->end = ALIGN(arg->end, PAGE_SIZE); arg->walk_end = 0; if (!arg->max_pages) arg->max_pages = ULONG_MAX; return 0; } static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, unsigned long uargl) { struct pm_scan_arg __user *uarg = (void __user *)uargl; if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) return -EFAULT; return 0; } static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) { if (!p->arg.vec_len) return 0; p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, p->arg.vec_len); p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), GFP_KERNEL); if (!p->vec_buf) return -ENOMEM; p->vec_buf->start = p->vec_buf->end = 0; p->vec_out = (struct page_region __user *)(long)p->arg.vec; return 0; } static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) { const struct page_region *buf = p->vec_buf; long n = p->vec_buf_index; if (!p->vec_buf) return 0; if (buf[n].end != buf[n].start) n++; if (!n) return 0; if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) return -EFAULT; p->arg.vec_len -= n; p->vec_out += n; p->vec_buf_index = 0; p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); p->vec_buf->start = p->vec_buf->end = 0; return n; } static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; int ret; ret = pagemap_scan_get_args(&p.arg, uarg); if (ret) return ret; p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | p.arg.return_mask; ret = pagemap_scan_init_bounce_buffer(&p); if (ret) return ret; for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { ret = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) break; /* Protection change for the range is going to happen. */ if (p.arg.flags & PM_SCAN_WP_MATCHING) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, walk_start, p.arg.end); mmu_notifier_invalidate_range_start(&range); } ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); if (p.arg.flags & PM_SCAN_WP_MATCHING) mmu_notifier_invalidate_range_end(&range); mmap_read_unlock(mm); n_out = pagemap_scan_flush_buffer(&p); if (n_out < 0) ret = n_out; else n_ranges_out += n_out; if (ret != -ENOSPC) break; if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) break; } /* ENOSPC signifies early stop (buffer full) from the walk. */ if (!ret || ret == -ENOSPC) ret = n_ranges_out; /* The walk_end isn't set when ret is zero */ if (!p.arg.walk_end) p.arg.walk_end = p.arg.end; if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; kfree(p.vec_buf); return ret; } static long do_pagemap_cmd(struct file *file, unsigned int cmd, unsigned long arg) { struct mm_struct *mm = file->private_data; switch (cmd) { case PAGEMAP_SCAN: return do_pagemap_scan(mm, arg); default: return -EINVAL; } } const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA struct numa_maps { unsigned long pages; unsigned long anon; unsigned long active; unsigned long writeback; unsigned long mapcount_max; unsigned long dirty; unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; struct numa_maps_private { struct proc_maps_private proc_maps; struct numa_maps md; }; static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); int count = folio_precise_page_mapcount(folio, page); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) md->dirty += nr_pages; if (folio_test_swapcache(folio)) md->swapcache += nr_pages; if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; if (folio_test_writeback(folio)) md->writeback += nr_pages; if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; md->node[folio_nid(folio)] += nr_pages; } static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pte_present(pte)) return NULL; page = vm_normal_page(vma, addr, pte); if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static struct page *can_gather_numa_stats_pmd(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pmd_present(pmd)) return NULL; page = vm_normal_page_pmd(vma, addr, pmd); if (!page) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #endif static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct numa_maps *md = walk->private; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { struct page *page; page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; } #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } do { pte_t ptent = ptep_get(pte); struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); struct numa_maps *md; struct page *page; if (!pte_present(huge_pte)) return 0; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); return 0; } #else static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, .walk_lock = PGWALK_RDLOCK, }; /* * Display pages allocated per node and memory policy via /proc. */ static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; char buffer[64]; struct mempolicy *pol; pgoff_t ilx; int nid; if (!mm) return 0; /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); } else { mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); } seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { seq_puts(m, " file="); seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); /* mmap_lock is held by m_start */ walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; if (md->anon) seq_printf(m, " anon=%lu", md->anon); if (md->dirty) seq_printf(m, " dirty=%lu", md->dirty); if (md->pages != md->anon && md->pages != md->dirty) seq_printf(m, " mapped=%lu", md->pages); if (md->mapcount_max > 1) seq_printf(m, " mapmax=%lu", md->mapcount_max); if (md->swapcache) seq_printf(m, " swapcache=%lu", md->swapcache); if (md->active < md->pages && !is_vm_hugetlb_page(vma)) seq_printf(m, " active=%lu", md->active); if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); return 0; } static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map, }; static int pid_numa_maps_open(struct inode *inode, struct file *file) { return proc_maps_open(inode, file, &proc_pid_numa_maps_op, sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; #endif /* CONFIG_NUMA */ |
| 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ struct sta_link_alloc { struct link_sta_info info; struct ieee80211_link_sta sta; struct rcu_head rcu_head; }; static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static const struct rhashtable_params link_sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct link_sta_info, link_hash_node), .key_offset = offsetof(struct link_sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static int link_sta_info_hash_add(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_insert(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } static int link_sta_info_hash_del(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_remove(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } void ieee80211_purge_sta_txqs(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; int i; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } ieee80211_purge_sta_txqs(sta); for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->link_sta_hash, addr, link_sta_rht_params); } struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct link_sta_info *link_sta; rcu_read_lock(); for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return link_sta; } } rcu_read_unlock(); return NULL; } struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id) { struct ieee80211_local *local = hw_to_local(hw); struct link_sta_info *link_sta; struct rhlist_head *tmp; for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; struct ieee80211_link_data *link; u8 _link_id = link_sta->link_id; if (!localaddr) { if (link_id) *link_id = _link_id; return &sta->sta; } link = rcu_dereference(sta->sdata->link[_link_id]); if (!link) continue; if (memcmp(link->conf->addr, localaddr, ETH_ALEN)) continue; if (link_id) *link_id = _link_id; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_access_pointer(sta->link[i]); if (!link_sta) continue; sta_remove_link(sta, i, false); } /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif sta_info_free_link(&sta->deflink); kfree(sta); } static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } static int sta_info_alloc_link(struct ieee80211_local *local, struct link_sta_info *link_info, gfp_t gfp) { struct ieee80211_hw *hw = &local->hw; int i; if (ieee80211_hw_check(hw, USES_RSS)) { link_info->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!link_info->pcpu_rx_stats) return -ENOMEM; } link_info->rx_stats.last_rx = jiffies; u64_stats_init(&link_info->rx_stats.syncp); ewma_signal_init(&link_info->rx_stats_avg.signal); ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); return 0; } static void sta_info_add_link(struct sta_info *sta, unsigned int link_id, struct link_sta_info *link_info, struct ieee80211_link_sta *link_sta) { link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); link_sta->smps_mode = IEEE80211_SMPS_OFF; link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; } static struct sta_info * __sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, int link_id, const u8 *link_addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; void *txq_data; int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; sta->local = local; sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, &sta->sta.deflink); sta->sta.valid_links = BIT(link_id); } else { sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink); } sta->sta.cur = &sta->sta.deflink.agg; spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); memcpy(sta->deflink.addr, link_addr, ETH_ALEN); memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* TODO link specific alloc and assignments for MLO Link STA */ /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) sta->amsdu_mesh_control = -1; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the (bufferable) MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; atomic_set(&sta->airtime[i].aql_tx_pending, 0); sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.deflink.supp_rates[i] |= BIT(r); } } sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; sta->cparams.ce_threshold_selector = 0; sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { return __sta_info_alloc(sdata, addr, -1, addr, gfp); } struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp) { return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp); } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_P2P_PS); } } static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_wiphy(local->hw.wiphy); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; if (sta->sta.valid_links) { err = link_sta_info_hash_add(local, &sta->deflink); if (err) { sta_info_hash_del(local, sta); goto out_drop_sta; } } list_add_tail_rcu(&sta->list, &local->sta_list); /* update channel context before notifying the driver about state * change, this enables driver using the updated channel context right away. */ if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); if (sta->sta.valid_links) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) continue; ieee80211_link_sta_debugfs_add(link_sta); if (sdata->vif.active_links & BIT(i)) ieee80211_link_sta_debugfs_drv_add(link_sta); } } else { ieee80211_link_sta_debugfs_add(&sta->deflink); ieee80211_link_sta_debugfs_drv_add(&sta->deflink); } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); ieee80211_check_fast_xmit(sta); return 0; out_remove: if (sta->sta.valid_links) link_sta_info_hash_del(local, &sta->deflink); sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret, i; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; if (!(sta->sta.valid_links & BIT(i))) continue; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); link_sta_info_hash_del(local, link_sta); } ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; if (sta->uploaded) drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static int _sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state, bool recalc) { struct ieee80211_local *local = sta->local; might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* notify the driver before the actual changes so it can * fail the transition */ if (test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); /* * If we have encryption offload, flush (station) queues * (after ensuring concurrent TX completed) so we won't * transmit anything later unencrypted if/when keys are * also removed, which might otherwise happen depending * on how the hardware offload works. */ if (local->ops->set_key) { synchronize_net(); if (local->ops->flush_sta) drv_flush_sta(local, sta->sdata, sta); else ieee80211_flush_queues(local, sta->sdata, false); } ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } sta->sta_state = new_state; return 0; } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { return _sta_info_move_state(sta, new_state, true); } static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if (!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (sta->local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { bool first = true; int link_id; if (!sta->sta.valid_links || !sta->sta.mlo) { sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { struct ieee80211_link_sta *link_sta; int i; if (!(active_links & BIT(link_id))) continue; link_sta = rcu_dereference(sta->sta.link[link_id]); if (!link_sta) continue; if (first) { sta->cur = sta->sta.deflink.agg; first = false; continue; } sta->cur.max_amsdu_len = min(sta->cur.max_amsdu_len, link_sta->agg.max_amsdu_len); sta->cur.max_rc_amsdu_len = min(sta->cur.max_rc_amsdu_len, link_sta->agg.max_rc_amsdu_len); for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++) sta->cur.max_tid_amsdu_len[i] = min(sta->cur.max_tid_amsdu_len[i], link_sta->agg.max_tid_amsdu_len[i]); } rcu_read_unlock(); sta->sta.cur = &sta->cur; } void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, tid); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } if (ieee80211_vif_is_mesh(&sdata->vif)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->deflink.status_stats.last_ack || time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; return sta->deflink.status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) { if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); sta->cparams.ecn = false; } else { sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; } } void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); sta_update_codel_params(sta, thr); } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct sta_link_alloc *alloc; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); /* must represent an MLD from the start */ if (WARN_ON(!sta->sta.valid_links)) return -EINVAL; if (WARN_ON(sta->sta.valid_links & BIT(link_id) || sta->link[link_id])) return -EBUSY; alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); if (!alloc) return -ENOMEM; ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL); if (ret) { kfree(alloc); return ret; } sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); ieee80211_link_sta_debugfs_add(&alloc->info); return 0; } void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id) { lockdep_assert_wiphy(sta->sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); sta_remove_link(sta, link_id, false); } int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct link_sta_info *link_sta; u16 old_links = sta->sta.valid_links; u16 new_links = old_links | BIT(link_id); int ret; link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sdata->local->hw.wiphy->mtx)); if (WARN_ON(old_links == new_links || !link_sta)) return -EINVAL; rcu_read_lock(); if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) { rcu_read_unlock(); return -EALREADY; } /* we only modify under the mutex so this is fine */ rcu_read_unlock(); sta->sta.valid_links = new_links; if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) goto hash; ieee80211_recalc_min_chandef(sdata, link_id); /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. */ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif |
| 1 2 2 2 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 | // SPDX-License-Identifier: GPL-2.0 /* * Interface for controlling IO bandwidth on a request queue * * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_data { /* service tree for active throtl groups */ struct throtl_service_queue service_queue; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; unsigned int throtl_slice; /* Work for dispatching throttled bios */ struct work_struct dispatch_work; bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); } /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned. */ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL; } /** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it. */ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) { struct throtl_grp *tg = sq_to_tg(sq); if (tg) return tg->td; else return container_of(sq, struct throtl_data, service_queue); } static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; return tg->iops[rw]; } /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported * @fmt: printf format string * @args: printf args * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ } while (0) static inline unsigned int throtl_bio_data_size(struct bio *bio) { /* assume it's one sector */ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) return 512; return bio->bi_iter.bi_size; } static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); bio_list_init(&qn->bios); qn->tg = tg; } /** * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to * @queued: the service_queue->queued[] list @qn belongs to * * Add @bio to @qn and put @qn on @queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, struct list_head *queued) { bio_list_add(&qn->bios, bio); if (list_empty(&qn->node)) { list_add_tail(&qn->node, queued); blkg_get(tg_to_blkg(qn->tg)); } } /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek */ static struct bio *throtl_peek_queued(struct list_head *queued) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list * @queued: the qnode list to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * * Pop the first bio from the qnode list @queued. After popping, the first * qnode is removed from @queued if empty or moved to the end of @queued so * that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); if (bio_list_empty(&qn->bios)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; else blkg_put(tg_to_blkg(qn->tg)); } else { list_move_tail(&qn->node, queued); } return bio; } /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[READ]); INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; if (blkg_rwstat_init(&tg->stat_bytes, gfp)) goto err_free_tg; if (blkg_rwstat_init(&tg->stat_ios, gfp)) goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } RB_CLEAR_NODE(&tg->rb_node); tg->bps[READ] = U64_MAX; tg->bps[WRITE] = U64_MAX; tg->iops[READ] = UINT_MAX; tg->iops[WRITE] = UINT_MAX; return &tg->pd; err_exit_stat_bytes: blkg_rwstat_exit(&tg->stat_bytes); err_free_tg: kfree(tg); return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M * read_bps limit is set on a parent group, summary bps of * parent group and its subtree groups can't exceed 16M for the * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } /* * Set has_rules[] if @tg or any of its parents have limits configured. * This doesn't require walking up to the top of the hierarchy as the * parent's has_rules[] is guaranteed to be correct. */ static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || tg_bps_limit(tg, rw) != U64_MAX; } } static void throtl_pd_online(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ tg_update_has_rules(tg); } static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); if (!n) return NULL; return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; tg = throtl_rb_first(parent_sq); if (!tg) return; parent_sq->first_pending_disptime = tg->disptime; } static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; bool leftmost = true; while (*node != NULL) { parent = *node; __tg = rb_entry_tg(parent); if (time_before(key, __tg->disptime)) node = &parent->rb_left; else { node = &parent->rb_right; leftmost = false; } } rb_link_node(&tg->rb_node, parent, node); rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { if (!(tg->flags & THROTL_TG_PENDING)) { tg_service_queue_add(tg); tg->flags |= THROTL_TG_PENDING; tg->service_queue.parent_sq->nr_pending++; } } static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; throtl_rb_erase(&tg->rb_node, parent_sq); --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } /* Call with queue lock held */ static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; /* * Since we are adjusting the throttle limit dynamically, the sleep * time calculated according to previous limit might be invalid. It's * possible the cgroup sleep time is very long and no other cgroups * have IO running so notify the limit changes. Make sure the cgroup * doesn't sleep too long to avoid the missed notification. */ if (time_after(expires, max_expire)) expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); } /** * throtl_schedule_next_dispatch - schedule the next dispatch cycle * @sq: the service_queue to schedule dispatch for * @force: force scheduling * * Arm @sq->pending_timer so that the next dispatch cycle starts on the * dispatch time of the first pending child. Returns %true if either timer * is armed or there's no pending child left. %false if the current * dispatch window is still open and the caller should continue * dispatching. * * If @force is %true, the dispatch timer is always scheduled and this * function is guaranteed to return %true. This is to be used when the * caller can't dispatch itself and needs to invoke pending_timer * unconditionally. Note that forced scheduling is likely to induce short * delay before dispatch starts even if @sq->first_pending_disptime is not * in the future and thus shouldn't be used in hot paths. */ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, bool force) { /* any pending children left? */ if (!sq->nr_pending) return true; update_min_dispatch_time(sq); /* is the next dispatch time in the future? */ if (force || time_after(sq->first_pending_disptime, jiffies)) { throtl_schedule_pending_timer(sq, sq->first_pending_disptime); return true; } /* tell the caller to continue dispatching */ return false; } static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, bool rw, unsigned long start) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->carryover_bytes[rw] = 0; tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used * that bandwidth. Do try to make use of that bandwidth while giving * credit. */ if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; if (clear_carryover) { tg->carryover_bytes[rw] = 0; tg->carryover_ios[rw] = 0; } throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; return true; } static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { unsigned int io_allowed; u64 tmp; /* * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) io_allowed = UINT_MAX; else io_allowed = tmp; return io_allowed; } static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) { /* * Can result be wider than 64 bits? * We check against 62, not 64, due to ilog2 truncation. */ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) return U64_MAX; return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long time_elapsed; long long bytes_trim; int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); /* * If bps are unlimited (-1), then time slice don't get * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ if (throtl_slice_used(tg, rw)) return; /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); if (!time_elapsed) return; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), time_elapsed) + tg->carryover_bytes[rw]; io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + tg->carryover_ios[rw]; if (bytes_trim <= 0 && io_trim <= 0) return; tg->carryover_bytes[rw] = 0; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; tg->carryover_ios[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } static void __tg_update_carryover(struct throtl_grp *tg, bool rw) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * If config is updated while bios are still throttled, calculate and * accumulate how many bytes/ios are waited across changes. And * carryover_bytes/ios will be used to calculate new wait time under new * configuration. */ if (bps_limit != U64_MAX) tg->carryover_bytes[rw] += calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) tg->carryover_ios[rw] += calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; } static void tg_update_carryover(struct throtl_grp *tg) { if (tg->service_queue.nr_queued[READ]) __tg_update_carryover(tg, READ); if (tg->service_queue.nr_queued[WRITE]) __tg_update_carryover(tg, WRITE); /* see comments in struct throtl_grp for meaning of these fields. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, u32 iops_limit) { bool rw = bio_data_dir(bio); int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { return 0; } jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; /* make sure at least one io can be dispatched after waiting */ jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); long long bytes_allowed; u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { return 0; } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; /* * This wait time is without taking into consideration the rounding * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); return jiffy_wait; } /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched */ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio * queued in the group bio list. So one should not be calling * this function with a different bio if there are other bios * queued. */ BUG_ON(tg->service_queue.nr_queued[rw] && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; } /* * If previous slice expired, start a new one otherwise renew/extend * existing slice to make sure it is at least throtl_slice interval * long since now. New slice is started only for empty throttle group. * If there is queued bio, that means there should be an active * slice and it should be extended instead. */ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); } bps_wait = tg_within_bps_limit(tg, bio, bps_limit); iops_wait = tg_within_iops_limit(tg, bio, iops_limit); if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true; } max_wait = max(bps_wait, iops_wait); if (wait) *wait = max_wait; if (time_before(tg->slice_end[rw], jiffies + max_wait)) throtl_extend_slice(tg, rw, jiffies + max_wait); return false; } static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { tg->bytes_disp[rw] += bio_size; tg->last_bytes_disp[rw] += bio_size; } tg->io_disp[rw]++; tg->last_io_disp[rw]++; } /** * throtl_add_bio_tg - add a bio to the specified throtl_grp * @bio: bio to add * @qn: qnode to use * @tg: the target throtl_grp * * Add @bio to @tg's service_queue using @qn. If @qn is not specified, * tg->qnode_on_self[] is used. */ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); if (!qn) qn = &tg->qnode_on_self[rw]; /* * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) tg_may_dispatch(tg, bio, &read_wait); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, struct throtl_grp *parent_tg, bool rw) { if (throtl_slice_used(parent_tg, rw)) { throtl_start_new_slice_with_credit(parent_tg, rw, child_tg->slice_start[rw]); } } static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; struct throtl_service_queue *parent_sq = sq->parent_sq; struct throtl_grp *parent_tg = sq_to_tg(parent_sq); struct throtl_grp *tg_to_put = NULL; struct bio *bio; /* * @bio is being transferred from @tg to @parent_sq. Popping a bio * from @tg may put its reference and @parent_sq might end up * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. */ bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to * the parent using throtl_add_bio_tg(). If our parent is * @td->service_queue, @bio is ready to be issued. Put it on its * bio_lists[] and decrease total number queued. The caller is * responsible for issuing these bios. */ if (parent_tg) { throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } throtl_trim_slice(tg, rw); if (tg_to_put) blkg_put(tg_to_blkg(tg_to_put)); } static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) break; } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) break; } return nr_reads + nr_writes; } static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; while (1) { struct throtl_grp *tg; struct throtl_service_queue *sq; if (!parent_sq->nr_pending) break; tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); else throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; } return nr_disp; } /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. This function * dispatches bio's from the children throtl_grps to the parent * service_queue. * * If the parent's parent is another throtl_grp, dispatching is propagated * by either arming its pending_timer or repeating dispatch directly. If * the top-level service_tree is reached, throtl_data->dispatch_work is * kicked so that the ready bio's are issued. */ static void throtl_pending_timer_fn(struct timer_list *t) { struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; struct request_queue *q; bool dispatched; int ret; /* throtl_data may be gone, so figure out request queue by blkg */ if (tg) q = tg->pd.blkg->q; else q = td->queue; spin_lock_irq(&q->queue_lock); if (!q->root_blkg) goto out_unlock; again: parent_sq = sq->parent_sq; dispatched = false; while (true) { throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", sq->nr_queued[READ] + sq->nr_queued[WRITE], sq->nr_queued[READ], sq->nr_queued[WRITE]); ret = throtl_select_dispatch(sq); if (ret) { throtl_log(sq, "bios disp=%u", ret); dispatched = true; } if (throtl_schedule_next_dispatch(sq, false)) break; /* this dispatch windows is still open, relax and repeat */ spin_unlock_irq(&q->queue_lock); cpu_relax(); spin_lock_irq(&q->queue_lock); } if (!dispatched) goto out_unlock; if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ sq = parent_sq; tg = sq_to_tg(sq); goto again; } } } else { /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: spin_unlock_irq(&q->queue_lock); } /** * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * * This function is queued for execution when bios reach the bio_lists[] * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; int rw; bio_list_init(&bio_list_on_stack); spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static int tg_print_conf_u64(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static int tg_print_conf_uint(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its * ancestors has rules. This identifies groups without any * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; } rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ throtl_start_new_slice(tg, READ, false); throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } } static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); /* * Freeze queue before activating policy, to synchronize with IO path, * which is protected by 'q_usage_counter'. */ blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; /* activate policy */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; kfree(td); goto out; } if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; else td->throtl_slice = DFL_THROTL_SLICE_HD; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); return ret; } static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; int ret; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) v = U64_MAX; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, true); } static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, false); } static int tg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_rwstat_recursive, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); u64 bps_dft; unsigned int iops_dft; if (!dname) return 0; bps_dft = U64_MAX; iops_dft = UINT_MAX; if (tg->bps[READ] == bps_dft && tg->bps[WRITE] == bps_dft && tg->iops[READ] == iops_dft && tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else seq_printf(sf, " rbps=%llu", tg->bps[READ]); if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else seq_printf(sf, " riops=%u", tg->iops[READ]); if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; } static int tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; int ret; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); v[0] = tg->bps[READ]; v[1] = tg->bps[WRITE]; v[2] = tg->iops[READ]; v[3] = tg->iops[WRITE]; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) break; if (tok[0] == '\0') break; ctx.body += len; ret = -EINVAL; p = tok; strsep(&p, "="); if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) goto out_finish; ret = -ERANGE; if (!val) goto out_finish; ret = -EINVAL; if (!strcmp(tok, "rbps")) v[0] = val; else if (!strcmp(tok, "wbps")) v[1] = val; else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; } tg->bps[READ] = v[0]; tg->bps[WRITE] = v[1]; tg->iops[READ] = v[2]; tg->iops[WRITE] = v[3]; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static struct cftype throtl_files[] = { { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, }, { } /* terminate */ }; static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; cancel_work_sync(&td->dispatch_work); } static void tg_flush_bios(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; if (tg->flags & THROTL_TG_CANCELING) return; /* * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ tg->flags |= THROTL_TG_CANCELING; /* * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup * will be inserted to service queue without THROTL_TG_PENDING * set in tg_update_disptime below. Then IO dispatched from * child in tg_dispatch_one_bio will trigger double insertion * and corrupt the tree. */ if (!(tg->flags & THROTL_TG_PENDING)) return; /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. */ tg_update_disptime(tg); throtl_schedule_pending_timer(sq, jiffies + 1); } static void throtl_pd_offline(struct blkg_policy_data *pd) { tg_flush_bios(pd_to_tg(pd)); } struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; void blk_throtl_cancel_bios(struct gendisk *disk) { struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; if (!blk_throtl_activated(q)) return; spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. * However, rcu lock is still held to emphasize that following * path need RCU protection and to prevent warning from lockdep. */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* * disk_release will call pd_offline_fn to cancel bios. * However, disk_release can't be called if someone get * the refcount of device and issued bios which are * inflight after del_gendisk. * Cancel bios here to ensure no bios are inflight after * del_gendisk. */ tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { /* throtl is FIFO - if bios are already queued, should queue */ if (tg->service_queue.nr_queued[rw]) return false; return tg_may_dispatch(tg, bio, NULL); } static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) { if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); tg->carryover_ios[rw]--; } bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; rcu_read_lock(); spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); /* * We need to trim slice even when bios are not being * queued otherwise it might happen that a bio is not * queued for a long time and slice keeps on extending * and trim is not called for a long time. Now if limits * are reduced suddenly we take into account all the IO * dispatched so far at new low rate and * newly queued * IO gets a really long dispatch time. * * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(tg, rw); } else if (bio_issue_as_root_blkg(bio)) { /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. * Debts are handled by carryover_bytes/ios while * calculating wait time. */ tg_dispatch_in_debt(tg, bio, rw); } else { /* if above limits, break to queue */ break; } /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); if (!tg) { bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; } } /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; /* * Update @tg's dispatch time and force schedule dispatch if @tg * was empty before @bio. The forced scheduling isn't likely to * cause undue delay as @bio is likely to be dispatched directly if * its @tg's disptime is not in the future. */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; if (!blk_throtl_activated(q)) return; del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init); |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | // SPDX-License-Identifier: GPL-2.0 /* * Creating audit events from TTY input. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Authors: Miloslav Trmac <mitr@redhat.com> */ #include <linux/audit.h> #include <linux/slab.h> #include <linux/tty.h> #include "tty.h" struct tty_audit_buf { struct mutex mutex; /* Protects all data below */ dev_t dev; /* The TTY which the data is from */ bool icanon; size_t valid; u8 *data; /* Allocated size N_TTY_BUF_SIZE */ }; static struct tty_audit_buf *tty_audit_buf_ref(void) { struct tty_audit_buf *buf; buf = current->signal->tty_audit_buf; WARN_ON(buf == ERR_PTR(-ESRCH)); return buf; } static struct tty_audit_buf *tty_audit_buf_alloc(void) { struct tty_audit_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) goto err; buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL); if (!buf->data) goto err_buf; mutex_init(&buf->mutex); return buf; err_buf: kfree(buf); err: return NULL; } static void tty_audit_buf_free(struct tty_audit_buf *buf) { WARN_ON(buf->valid != 0); kfree(buf->data); kfree(buf); } static void tty_audit_log(const char *description, dev_t dev, const u8 *data, size_t size) { struct audit_buffer *ab; pid_t pid = task_pid_nr(current); uid_t uid = from_kuid(&init_user_ns, task_uid(current)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); unsigned int sessionid = audit_get_sessionid(current); char name[TASK_COMM_LEN]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY); if (!ab) return; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); audit_log_end(ab); } /* * tty_audit_buf_push - Push buffered data out * * Generate an audit message from the contents of @buf, which is owned by * the current task. @buf->mutex must be locked. */ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } tty_audit_log("tty", buf->dev, buf->data, buf->valid); buf->valid = 0; } /** * tty_audit_exit - Handle a task exit * * Make sure all buffered data is written out and deallocate the buffer. * Only needs to be called if current->signal->tty_audit_buf != %NULL. * * The process is single-threaded at this point; no other threads share * current->signal. */ void tty_audit_exit(void) { struct tty_audit_buf *buf; buf = xchg(¤t->signal->tty_audit_buf, ERR_PTR(-ESRCH)); if (!buf) return; tty_audit_buf_push(buf); tty_audit_buf_free(buf); } /* * tty_audit_fork - Copy TTY audit state for a new task * * Set up TTY audit state in @sig from current. @sig needs no locking. */ void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; } /* * tty_audit_tiocsti - Log TIOCSTI */ void tty_audit_tiocsti(const struct tty_struct *tty, u8 ch) { dev_t dev; dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (tty_audit_push()) return; if (audit_enabled) tty_audit_log("ioctl=TIOCSTI", dev, &ch, 1); } /* * tty_audit_push - Flush current's pending audit data * * Returns 0 if success, -EPERM if tty audit is disabled */ int tty_audit_push(void) { struct tty_audit_buf *buf; if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) return -EPERM; buf = tty_audit_buf_ref(); if (!IS_ERR_OR_NULL(buf)) { mutex_lock(&buf->mutex); tty_audit_buf_push(buf); mutex_unlock(&buf->mutex); } return 0; } /* * tty_audit_buf_get - Get an audit buffer. * * Get an audit buffer, allocate it if necessary. Return %NULL * if out of memory or ERR_PTR(-ESRCH) if tty_audit_exit() has already * occurred. Otherwise, return a new reference to the buffer. */ static struct tty_audit_buf *tty_audit_buf_get(void) { struct tty_audit_buf *buf; buf = tty_audit_buf_ref(); if (buf) return buf; buf = tty_audit_buf_alloc(); if (buf == NULL) { audit_log_lost("out of memory in TTY auditing"); return NULL; } /* Race to use this buffer, free it if another wins */ if (cmpxchg(¤t->signal->tty_audit_buf, NULL, buf) != NULL) tty_audit_buf_free(buf); return tty_audit_buf_ref(); } /* * tty_audit_add_data - Add data for TTY auditing. * * Audit @data of @size from @tty, if necessary. */ void tty_audit_add_data(const struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; unsigned int audit_tty; bool icanon = L_ICANON(tty); dev_t dev; audit_tty = READ_ONCE(current->signal->audit_tty); if (~audit_tty & AUDIT_TTY_ENABLE) return; if (unlikely(size == 0)) return; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) return; if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); if (IS_ERR_OR_NULL(buf)) return; mutex_lock(&buf->mutex); dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (buf->dev != dev || buf->icanon != icanon) { tty_audit_buf_push(buf); buf->dev = dev; buf->icanon = icanon; } do { size_t run; run = N_TTY_BUF_SIZE - buf->valid; if (run > size) run = size; memcpy(buf->data + buf->valid, data, run); buf->valid += run; data += run; size -= run; if (buf->valid == N_TTY_BUF_SIZE) tty_audit_buf_push(buf); } while (size != 0); mutex_unlock(&buf->mutex); } |
| 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. */ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H #include <linux/fs.h> handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, unsigned to); int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages); int walk_page_buffers( handle_t *handle, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, OCFS2_WRITE_DIRECT, OCFS2_WRITE_MMAP, } ocfs2_write_type_t; int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) { set_bit(0, (unsigned long *)&iocb->private); if (level) set_bit(1, (unsigned long *)&iocb->private); else clear_bit(1, (unsigned long *)&iocb->private); } /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). */ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, OCFS2_IOCB_NUM_LOCKS }; #define ocfs2_iocb_init_rw_locked(iocb) \ (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ |
| 477 301 394 350 172 343 177 306 177 307 469 274 469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include <linux/iversion.h> /* * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, struct xfs_inode *ip, uint lock_flags) { struct xfs_inode_log_item *iip; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* Reset the per-tx dirty context and add the item to the tx. */ iip->ili_dirty_flags = 0; xfs_trans_add_item(tp, &iip->ili_item); } /* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to * track dirty state and update/writeback the inode accordingly. */ void xfs_trans_ichgtime( struct xfs_trans *tp, struct xfs_inode *ip, int flags) { struct inode *inode = VFS_I(ip); struct timespec64 tv; ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); /* If the mtime changes, then ctime must also change */ ASSERT(flags & XFS_ICHGTIME_CHG); tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } /* * This is called to mark the fields indicated in fieldmask as needing to be * logged when the transaction is committed. The inode must already be * associated with the given transaction. All we do here is record where the * inode was dirtied and mark the transaction and inode log item dirty; * everything else is done in the ->precommit log item operation after the * changes in the transaction have been completed. */ void xfs_trans_log_inode( struct xfs_trans *tp, struct xfs_inode *ip, uint flags) { struct xfs_inode_log_item *iip = ip->i_itemp; struct inode *inode = VFS_I(ip); ASSERT(iip); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); tp->t_flags |= XFS_TRANS_DIRTY; /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually * avoid setting XFS_ILOG_CORE if no one has queried the value since * the last time it was incremented. If we have XFS_ILOG_CORE already * set however, then go ahead and bump the i_version counter * unconditionally. */ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) flags |= XFS_ILOG_IVERSION; } iip->ili_dirty_flags |= flags; } int xfs_trans_roll_inode( struct xfs_trans **tpp, struct xfs_inode *ip) { int error; xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); error = xfs_trans_roll(tpp); if (!error) xfs_trans_ijoin(*tpp, ip, 0); return error; } |
| 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 | // SPDX-License-Identifier: GPL-2.0-or-later /* * V4L2 controls framework control definitions. * * Copyright (C) 2010-2021 Hans Verkuil <hverkuil-cisco@xs4all.nl> */ #include <linux/export.h> #include <media/v4l2-ctrls.h> /* * Returns NULL or a character pointer array containing the menu for * the given control ID. The pointer array ends with a NULL pointer. * An empty string signifies a menu entry that is invalid. This allows * drivers to disable certain options if it is not supported. */ const char * const *v4l2_ctrl_get_menu(u32 id) { static const char * const mpeg_audio_sampling_freq[] = { "44.1 kHz", "48 kHz", "32 kHz", NULL }; static const char * const mpeg_audio_encoding[] = { "MPEG-1/2 Layer I", "MPEG-1/2 Layer II", "MPEG-1/2 Layer III", "MPEG-2/4 AAC", "AC-3", NULL }; static const char * const mpeg_audio_l1_bitrate[] = { "32 kbps", "64 kbps", "96 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "288 kbps", "320 kbps", "352 kbps", "384 kbps", "416 kbps", "448 kbps", NULL }; static const char * const mpeg_audio_l2_bitrate[] = { "32 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", "384 kbps", NULL }; static const char * const mpeg_audio_l3_bitrate[] = { "32 kbps", "40 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", NULL }; static const char * const mpeg_audio_ac3_bitrate[] = { "32 kbps", "40 kbps", "48 kbps", "56 kbps", "64 kbps", "80 kbps", "96 kbps", "112 kbps", "128 kbps", "160 kbps", "192 kbps", "224 kbps", "256 kbps", "320 kbps", "384 kbps", "448 kbps", "512 kbps", "576 kbps", "640 kbps", NULL }; static const char * const mpeg_audio_mode[] = { "Stereo", "Joint Stereo", "Dual", "Mono", NULL }; static const char * const mpeg_audio_mode_extension[] = { "Bound 4", "Bound 8", "Bound 12", "Bound 16", NULL }; static const char * const mpeg_audio_emphasis[] = { "No Emphasis", "50/15 us", "CCITT J17", NULL }; static const char * const mpeg_audio_crc[] = { "No CRC", "16-bit CRC", NULL }; static const char * const mpeg_audio_dec_playback[] = { "Auto", "Stereo", "Left", "Right", "Mono", "Swapped Stereo", NULL }; static const char * const mpeg_video_encoding[] = { "MPEG-1", "MPEG-2", "MPEG-4 AVC", NULL }; static const char * const mpeg_video_aspect[] = { "1x1", "4x3", "16x9", "2.21x1", NULL }; static const char * const mpeg_video_bitrate_mode[] = { "Variable Bitrate", "Constant Bitrate", "Constant Quality", NULL }; static const char * const mpeg_stream_type[] = { "MPEG-2 Program Stream", "MPEG-2 Transport Stream", "MPEG-1 System Stream", "MPEG-2 DVD-compatible Stream", "MPEG-1 VCD-compatible Stream", "MPEG-2 SVCD-compatible Stream", NULL }; static const char * const mpeg_stream_vbi_fmt[] = { "No VBI", "Private Packet, IVTV Format", NULL }; static const char * const camera_power_line_frequency[] = { "Disabled", "50 Hz", "60 Hz", "Auto", NULL }; static const char * const camera_exposure_auto[] = { "Auto Mode", "Manual Mode", "Shutter Priority Mode", "Aperture Priority Mode", NULL }; static const char * const camera_exposure_metering[] = { "Average", "Center Weighted", "Spot", "Matrix", NULL }; static const char * const camera_auto_focus_range[] = { "Auto", "Normal", "Macro", "Infinity", NULL }; static const char * const colorfx[] = { "None", "Black & White", "Sepia", "Negative", "Emboss", "Sketch", "Sky Blue", "Grass Green", "Skin Whiten", "Vivid", "Aqua", "Art Freeze", "Silhouette", "Solarization", "Antique", "Set Cb/Cr", NULL }; static const char * const auto_n_preset_white_balance[] = { "Manual", "Auto", "Incandescent", "Fluorescent", "Fluorescent H", "Horizon", "Daylight", "Flash", "Cloudy", "Shade", NULL, }; static const char * const camera_iso_sensitivity_auto[] = { "Manual", "Auto", NULL }; static const char * const scene_mode[] = { "None", "Backlight", "Beach/Snow", "Candle Light", "Dusk/Dawn", "Fall Colors", "Fireworks", "Landscape", "Night", "Party/Indoor", "Portrait", "Sports", "Sunset", "Text", NULL }; static const char * const tune_emphasis[] = { "None", "50 Microseconds", "75 Microseconds", NULL, }; static const char * const header_mode[] = { "Separate Buffer", "Joined With 1st Frame", NULL, }; static const char * const multi_slice[] = { "Single", "Max Macroblocks", "Max Bytes", NULL, }; static const char * const entropy_mode[] = { "CAVLC", "CABAC", NULL, }; static const char * const mpeg_h264_level[] = { "1", "1b", "1.1", "1.2", "1.3", "2", "2.1", "2.2", "3", "3.1", "3.2", "4", "4.1", "4.2", "5", "5.1", "5.2", "6.0", "6.1", "6.2", NULL, }; static const char * const h264_loop_filter[] = { "Enabled", "Disabled", "Disabled at Slice Boundary", NULL, }; static const char * const h264_profile[] = { "Baseline", "Constrained Baseline", "Main", "Extended", "High", "High 10", "High 422", "High 444 Predictive", "High 10 Intra", "High 422 Intra", "High 444 Intra", "CAVLC 444 Intra", "Scalable Baseline", "Scalable High", "Scalable High Intra", "Stereo High", "Multiview High", "Constrained High", NULL, }; static const char * const vui_sar_idc[] = { "Unspecified", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11", "32:11", "80:33", "18:11", "15:11", "64:33", "160:99", "4:3", "3:2", "2:1", "Extended SAR", NULL, }; static const char * const h264_fp_arrangement_type[] = { "Checkerboard", "Column", "Row", "Side by Side", "Top Bottom", "Temporal", NULL, }; static const char * const h264_fmo_map_type[] = { "Interleaved Slices", "Scattered Slices", "Foreground with Leftover", "Box Out", "Raster Scan", "Wipe Scan", "Explicit", NULL, }; static const char * const h264_decode_mode[] = { "Slice-Based", "Frame-Based", NULL, }; static const char * const h264_start_code[] = { "No Start Code", "Annex B Start Code", NULL, }; static const char * const h264_hierarchical_coding_type[] = { "Hier Coding B", "Hier Coding P", NULL, }; static const char * const mpeg_mpeg2_level[] = { "Low", "Main", "High 1440", "High", NULL, }; static const char * const mpeg2_profile[] = { "Simple", "Main", "SNR Scalable", "Spatially Scalable", "High", NULL, }; static const char * const mpeg_mpeg4_level[] = { "0", "0b", "1", "2", "3", "3b", "4", "5", NULL, }; static const char * const mpeg4_profile[] = { "Simple", "Advanced Simple", "Core", "Simple Scalable", "Advanced Coding Efficiency", NULL, }; static const char * const vpx_golden_frame_sel[] = { "Use Previous Frame", "Use Previous Specific Frame", NULL, }; static const char * const vp8_profile[] = { "0", "1", "2", "3", NULL, }; static const char * const vp9_profile[] = { "0", "1", "2", "3", NULL, }; static const char * const vp9_level[] = { "1", "1.1", "2", "2.1", "3", "3.1", "4", "4.1", "5", "5.1", "5.2", "6", "6.1", "6.2", NULL, }; static const char * const flash_led_mode[] = { "Off", "Flash", "Torch", NULL, }; static const char * const flash_strobe_source[] = { "Software", "External", NULL, }; static const char * const jpeg_chroma_subsampling[] = { "4:4:4", "4:2:2", "4:2:0", "4:1:1", "4:1:0", "Gray", NULL, }; static const char * const dv_tx_mode[] = { "DVI-D", "HDMI", NULL, }; static const char * const dv_rgb_range[] = { "Automatic", "RGB Limited Range (16-235)", "RGB Full Range (0-255)", NULL, }; static const char * const dv_it_content_type[] = { "Graphics", "Photo", "Cinema", "Game", "No IT Content", NULL, }; static const char * const detect_md_mode[] = { "Disabled", "Global", "Threshold Grid", "Region Grid", NULL, }; static const char * const av1_profile[] = { "Main", "High", "Professional", NULL, }; static const char * const av1_level[] = { "2.0", "2.1", "2.2", "2.3", "3.0", "3.1", "3.2", "3.3", "4.0", "4.1", "4.2", "4.3", "5.0", "5.1", "5.2", "5.3", "6.0", "6.1", "6.2", "6.3", "7.0", "7.1", "7.2", "7.3", NULL, }; static const char * const hevc_profile[] = { "Main", "Main Still Picture", "Main 10", NULL, }; static const char * const hevc_level[] = { "1", "2", "2.1", "3", "3.1", "4", "4.1", "5", "5.1", "5.2", "6", "6.1", "6.2", NULL, }; static const char * const hevc_hierarchial_coding_type[] = { "B", "P", NULL, }; static const char * const hevc_refresh_type[] = { "None", "CRA", "IDR", NULL, }; static const char * const hevc_size_of_length_field[] = { "0", "1", "2", "4", NULL, }; static const char * const hevc_tier[] = { "Main", "High", NULL, }; static const char * const hevc_loop_filter_mode[] = { "Disabled", "Enabled", "Disabled at slice boundary", "NULL", }; static const char * const hevc_decode_mode[] = { "Slice-Based", "Frame-Based", NULL, }; static const char * const hevc_start_code[] = { "No Start Code", "Annex B Start Code", NULL, }; static const char * const camera_orientation[] = { "Front", "Back", "External", NULL, }; static const char * const mpeg_video_frame_skip[] = { "Disabled", "Level Limit", "VBV/CPB Limit", NULL, }; static const char * const intra_refresh_period_type[] = { "Random", "Cyclic", NULL, }; switch (id) { case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: return mpeg_audio_sampling_freq; case V4L2_CID_MPEG_AUDIO_ENCODING: return mpeg_audio_encoding; case V4L2_CID_MPEG_AUDIO_L1_BITRATE: return mpeg_audio_l1_bitrate; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: return mpeg_audio_l2_bitrate; case V4L2_CID_MPEG_AUDIO_L3_BITRATE: return mpeg_audio_l3_bitrate; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: return mpeg_audio_ac3_bitrate; case V4L2_CID_MPEG_AUDIO_MODE: return mpeg_audio_mode; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: return mpeg_audio_mode_extension; case V4L2_CID_MPEG_AUDIO_EMPHASIS: return mpeg_audio_emphasis; case V4L2_CID_MPEG_AUDIO_CRC: return mpeg_audio_crc; case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: return mpeg_audio_dec_playback; case V4L2_CID_MPEG_VIDEO_ENCODING: return mpeg_video_encoding; case V4L2_CID_MPEG_VIDEO_ASPECT: return mpeg_video_aspect; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: return mpeg_video_bitrate_mode; case V4L2_CID_MPEG_STREAM_TYPE: return mpeg_stream_type; case V4L2_CID_MPEG_STREAM_VBI_FMT: return mpeg_stream_vbi_fmt; case V4L2_CID_POWER_LINE_FREQUENCY: return camera_power_line_frequency; case V4L2_CID_EXPOSURE_AUTO: return camera_exposure_auto; case V4L2_CID_EXPOSURE_METERING: return camera_exposure_metering; case V4L2_CID_AUTO_FOCUS_RANGE: return camera_auto_focus_range; case V4L2_CID_COLORFX: return colorfx; case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: return auto_n_preset_white_balance; case V4L2_CID_ISO_SENSITIVITY_AUTO: return camera_iso_sensitivity_auto; case V4L2_CID_SCENE_MODE: return scene_mode; case V4L2_CID_TUNE_PREEMPHASIS: return tune_emphasis; case V4L2_CID_TUNE_DEEMPHASIS: return tune_emphasis; case V4L2_CID_FLASH_LED_MODE: return flash_led_mode; case V4L2_CID_FLASH_STROBE_SOURCE: return flash_strobe_source; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return header_mode; case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: return mpeg_video_frame_skip; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: return multi_slice; case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: return entropy_mode; case V4L2_CID_MPEG_VIDEO_H264_LEVEL: return mpeg_h264_level; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: return h264_loop_filter; case V4L2_CID_MPEG_VIDEO_H264_PROFILE: return h264_profile; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: return vui_sar_idc; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: return h264_fp_arrangement_type; case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: return h264_fmo_map_type; case V4L2_CID_STATELESS_H264_DECODE_MODE: return h264_decode_mode; case V4L2_CID_STATELESS_H264_START_CODE: return h264_start_code; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: return h264_hierarchical_coding_type; case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: return mpeg_mpeg2_level; case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: return mpeg2_profile; case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: return mpeg_mpeg4_level; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: return mpeg4_profile; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: return vpx_golden_frame_sel; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return vp8_profile; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return vp9_profile; case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: return vp9_level; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return jpeg_chroma_subsampling; case V4L2_CID_DV_TX_MODE: return dv_tx_mode; case V4L2_CID_DV_TX_RGB_RANGE: case V4L2_CID_DV_RX_RGB_RANGE: return dv_rgb_range; case V4L2_CID_DV_TX_IT_CONTENT_TYPE: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: return dv_it_content_type; case V4L2_CID_DETECT_MD_MODE: return detect_md_mode; case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: return hevc_profile; case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: return hevc_level; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: return hevc_hierarchial_coding_type; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: return hevc_refresh_type; case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: return hevc_size_of_length_field; case V4L2_CID_MPEG_VIDEO_HEVC_TIER: return hevc_tier; case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: return hevc_loop_filter_mode; case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: return av1_profile; case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: return av1_level; case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return hevc_decode_mode; case V4L2_CID_STATELESS_HEVC_START_CODE: return hevc_start_code; case V4L2_CID_CAMERA_ORIENTATION: return camera_orientation; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: return intra_refresh_period_type; default: return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_menu); #define __v4l2_qmenu_int_len(arr, len) ({ *(len) = ARRAY_SIZE(arr); (arr); }) /* * Returns NULL or an s64 type array containing the menu for given * control ID. The total number of the menu items is returned in @len. */ const s64 *v4l2_ctrl_get_int_menu(u32 id, u32 *len) { static const s64 qmenu_int_vpx_num_partitions[] = { 1, 2, 4, 8, }; static const s64 qmenu_int_vpx_num_ref_frames[] = { 1, 2, 3, }; switch (id) { case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: return __v4l2_qmenu_int_len(qmenu_int_vpx_num_partitions, len); case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: return __v4l2_qmenu_int_len(qmenu_int_vpx_num_ref_frames, len); default: *len = 0; return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_int_menu); /* Return the control name. */ const char *v4l2_ctrl_get_name(u32 id) { switch (id) { /* USER controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_USER_CLASS: return "User Controls"; case V4L2_CID_BRIGHTNESS: return "Brightness"; case V4L2_CID_CONTRAST: return "Contrast"; case V4L2_CID_SATURATION: return "Saturation"; case V4L2_CID_HUE: return "Hue"; case V4L2_CID_AUDIO_VOLUME: return "Volume"; case V4L2_CID_AUDIO_BALANCE: return "Balance"; case V4L2_CID_AUDIO_BASS: return "Bass"; case V4L2_CID_AUDIO_TREBLE: return "Treble"; case V4L2_CID_AUDIO_MUTE: return "Mute"; case V4L2_CID_AUDIO_LOUDNESS: return "Loudness"; case V4L2_CID_BLACK_LEVEL: return "Black Level"; case V4L2_CID_AUTO_WHITE_BALANCE: return "White Balance, Automatic"; case V4L2_CID_DO_WHITE_BALANCE: return "Do White Balance"; case V4L2_CID_RED_BALANCE: return "Red Balance"; case V4L2_CID_BLUE_BALANCE: return "Blue Balance"; case V4L2_CID_GAMMA: return "Gamma"; case V4L2_CID_EXPOSURE: return "Exposure"; case V4L2_CID_AUTOGAIN: return "Gain, Automatic"; case V4L2_CID_GAIN: return "Gain"; case V4L2_CID_HFLIP: return "Horizontal Flip"; case V4L2_CID_VFLIP: return "Vertical Flip"; case V4L2_CID_POWER_LINE_FREQUENCY: return "Power Line Frequency"; case V4L2_CID_HUE_AUTO: return "Hue, Automatic"; case V4L2_CID_WHITE_BALANCE_TEMPERATURE: return "White Balance Temperature"; case V4L2_CID_SHARPNESS: return "Sharpness"; case V4L2_CID_BACKLIGHT_COMPENSATION: return "Backlight Compensation"; case V4L2_CID_CHROMA_AGC: return "Chroma AGC"; case V4L2_CID_COLOR_KILLER: return "Color Killer"; case V4L2_CID_COLORFX: return "Color Effects"; case V4L2_CID_AUTOBRIGHTNESS: return "Brightness, Automatic"; case V4L2_CID_BAND_STOP_FILTER: return "Band-Stop Filter"; case V4L2_CID_ROTATE: return "Rotate"; case V4L2_CID_BG_COLOR: return "Background Color"; case V4L2_CID_CHROMA_GAIN: return "Chroma Gain"; case V4L2_CID_ILLUMINATORS_1: return "Illuminator 1"; case V4L2_CID_ILLUMINATORS_2: return "Illuminator 2"; case V4L2_CID_MIN_BUFFERS_FOR_CAPTURE: return "Min Number of Capture Buffers"; case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: return "Min Number of Output Buffers"; case V4L2_CID_ALPHA_COMPONENT: return "Alpha Component"; case V4L2_CID_COLORFX_CBCR: return "Color Effects, CbCr"; case V4L2_CID_COLORFX_RGB: return "Color Effects, RGB"; /* * Codec controls * * The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical. * * Keep the order of the 'case's the same as in videodev2.h! */ case V4L2_CID_CODEC_CLASS: return "Codec Controls"; case V4L2_CID_MPEG_STREAM_TYPE: return "Stream Type"; case V4L2_CID_MPEG_STREAM_PID_PMT: return "Stream PMT Program ID"; case V4L2_CID_MPEG_STREAM_PID_AUDIO: return "Stream Audio Program ID"; case V4L2_CID_MPEG_STREAM_PID_VIDEO: return "Stream Video Program ID"; case V4L2_CID_MPEG_STREAM_PID_PCR: return "Stream PCR Program ID"; case V4L2_CID_MPEG_STREAM_PES_ID_AUDIO: return "Stream PES Audio ID"; case V4L2_CID_MPEG_STREAM_PES_ID_VIDEO: return "Stream PES Video ID"; case V4L2_CID_MPEG_STREAM_VBI_FMT: return "Stream VBI Format"; case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: return "Audio Sampling Frequency"; case V4L2_CID_MPEG_AUDIO_ENCODING: return "Audio Encoding"; case V4L2_CID_MPEG_AUDIO_L1_BITRATE: return "Audio Layer I Bitrate"; case V4L2_CID_MPEG_AUDIO_L2_BITRATE: return "Audio Layer II Bitrate"; case V4L2_CID_MPEG_AUDIO_L3_BITRATE: return "Audio Layer III Bitrate"; case V4L2_CID_MPEG_AUDIO_MODE: return "Audio Stereo Mode"; case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: return "Audio Stereo Mode Extension"; case V4L2_CID_MPEG_AUDIO_EMPHASIS: return "Audio Emphasis"; case V4L2_CID_MPEG_AUDIO_CRC: return "Audio CRC"; case V4L2_CID_MPEG_AUDIO_MUTE: return "Audio Mute"; case V4L2_CID_MPEG_AUDIO_AAC_BITRATE: return "Audio AAC Bitrate"; case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: return "Audio AC-3 Bitrate"; case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: return "Audio Playback"; case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: return "Audio Multilingual Playback"; case V4L2_CID_MPEG_VIDEO_ENCODING: return "Video Encoding"; case V4L2_CID_MPEG_VIDEO_ASPECT: return "Video Aspect"; case V4L2_CID_MPEG_VIDEO_B_FRAMES: return "Video B Frames"; case V4L2_CID_MPEG_VIDEO_GOP_SIZE: return "Video GOP Size"; case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: return "Video GOP Closure"; case V4L2_CID_MPEG_VIDEO_PULLDOWN: return "Video Pulldown"; case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: return "Video Bitrate Mode"; case V4L2_CID_MPEG_VIDEO_CONSTANT_QUALITY: return "Constant Quality"; case V4L2_CID_MPEG_VIDEO_BITRATE: return "Video Bitrate"; case V4L2_CID_MPEG_VIDEO_BITRATE_PEAK: return "Video Peak Bitrate"; case V4L2_CID_MPEG_VIDEO_TEMPORAL_DECIMATION: return "Video Temporal Decimation"; case V4L2_CID_MPEG_VIDEO_MUTE: return "Video Mute"; case V4L2_CID_MPEG_VIDEO_MUTE_YUV: return "Video Mute YUV"; case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: return "Decoder Slice Interface"; case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: return "MPEG4 Loop Filter Enable"; case V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB: return "Number of Intra Refresh MBs"; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: return "Intra Refresh Period Type"; case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: return "Intra Refresh Period"; case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: return "Frame Level Rate Control Enable"; case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: return "H264 MB Level Rate Control"; case V4L2_CID_MPEG_VIDEO_HEADER_MODE: return "Sequence Header Mode"; case V4L2_CID_MPEG_VIDEO_MAX_REF_PIC: return "Max Number of Reference Pics"; case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: return "Frame Skip Mode"; case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY: return "Display Delay"; case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE: return "Display Delay Enable"; case V4L2_CID_MPEG_VIDEO_AU_DELIMITER: return "Generate Access Unit Delimiters"; case V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP: return "H263 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP: return "H263 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_B_FRAME_QP: return "H263 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H263_MIN_QP: return "H263 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H263_MAX_QP: return "H263 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_QP: return "H264 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_QP: return "H264 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_QP: return "H264 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_H264_MAX_QP: return "H264 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_MIN_QP: return "H264 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM: return "H264 8x8 Transform Enable"; case V4L2_CID_MPEG_VIDEO_H264_CPB_SIZE: return "H264 CPB Buffer Size"; case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: return "H264 Entropy Mode"; case V4L2_CID_MPEG_VIDEO_H264_I_PERIOD: return "H264 I-Frame Period"; case V4L2_CID_MPEG_VIDEO_H264_LEVEL: return "H264 Level"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_ALPHA: return "H264 Loop Filter Alpha Offset"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_BETA: return "H264 Loop Filter Beta Offset"; case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: return "H264 Loop Filter Mode"; case V4L2_CID_MPEG_VIDEO_H264_PROFILE: return "H264 Profile"; case V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_HEIGHT: return "Vertical Size of SAR"; case V4L2_CID_MPEG_VIDEO_H264_VUI_EXT_SAR_WIDTH: return "Horizontal Size of SAR"; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE: return "Aspect Ratio VUI Enable"; case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: return "VUI Aspect Ratio IDC"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FRAME_PACKING: return "H264 Enable Frame Packing SEI"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_CURRENT_FRAME_0: return "H264 Set Curr. Frame as Frame0"; case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: return "H264 FP Arrangement Type"; case V4L2_CID_MPEG_VIDEO_H264_FMO: return "H264 Flexible MB Ordering"; case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: return "H264 Map Type for FMO"; case V4L2_CID_MPEG_VIDEO_H264_FMO_SLICE_GROUP: return "H264 FMO Number of Slice Groups"; case V4L2_CID_MPEG_VIDEO_H264_FMO_CHANGE_DIRECTION: return "H264 FMO Direction of Change"; case V4L2_CID_MPEG_VIDEO_H264_FMO_CHANGE_RATE: return "H264 FMO Size of 1st Slice Grp"; case V4L2_CID_MPEG_VIDEO_H264_FMO_RUN_LENGTH: return "H264 FMO No. of Consecutive MBs"; case V4L2_CID_MPEG_VIDEO_H264_ASO: return "H264 Arbitrary Slice Ordering"; case V4L2_CID_MPEG_VIDEO_H264_ASO_SLICE_ORDER: return "H264 ASO Slice Order"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING: return "Enable H264 Hierarchical Coding"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: return "H264 Hierarchical Coding Type"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP: return "H264 Set QP Value for HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: return "H264 Constrained Intra Pred"; case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET: return "H264 Chroma QP Index Offset"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_MIN_QP: return "H264 I-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_I_FRAME_MAX_QP: return "H264 I-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MIN_QP: return "H264 P-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_P_FRAME_MAX_QP: return "H264 P-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_MIN_QP: return "H264 B-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_B_FRAME_MAX_QP: return "H264 B-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L0_BR: return "H264 Hierarchical Lay 0 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L1_BR: return "H264 Hierarchical Lay 1 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L2_BR: return "H264 Hierarchical Lay 2 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L3_BR: return "H264 Hierarchical Lay 3 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L4_BR: return "H264 Hierarchical Lay 4 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L5_BR: return "H264 Hierarchical Lay 5 Bitrate"; case V4L2_CID_MPEG_VIDEO_H264_HIER_CODING_L6_BR: return "H264 Hierarchical Lay 6 Bitrate"; case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: return "MPEG2 Level"; case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: return "MPEG2 Profile"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_MIN_QP: return "MPEG4 Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_MAX_QP: return "MPEG4 Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: return "MPEG4 Level"; case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: return "MPEG4 Profile"; case V4L2_CID_MPEG_VIDEO_MPEG4_QPEL: return "Quarter Pixel Search Enable"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MAX_BYTES: return "Maximum Bytes in a Slice"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MAX_MB: return "Number of MBs in a Slice"; case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: return "Slice Partitioning Method"; case V4L2_CID_MPEG_VIDEO_VBV_SIZE: return "VBV Buffer Size"; case V4L2_CID_MPEG_VIDEO_DEC_PTS: return "Video Decoder PTS"; case V4L2_CID_MPEG_VIDEO_DEC_FRAME: return "Video Decoder Frame Count"; case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR: return "Video Decoder Conceal Color"; case V4L2_CID_MPEG_VIDEO_VBV_DELAY: return "Initial Delay for VBV Control"; case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE: return "Horizontal MV Search Range"; case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE: return "Vertical MV Search Range"; case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER: return "Repeat Sequence Header"; case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME: return "Force Key Frame"; case V4L2_CID_MPEG_VIDEO_BASELAYER_PRIORITY_ID: return "Base Layer Priority ID"; case V4L2_CID_MPEG_VIDEO_LTR_COUNT: return "LTR Count"; case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX: return "Frame LTR Index"; case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES: return "Use LTR Frames"; case V4L2_CID_MPEG_VIDEO_AVERAGE_QP: return "Average QP Value"; case V4L2_CID_FWHT_I_FRAME_QP: return "FWHT I-Frame QP Value"; case V4L2_CID_FWHT_P_FRAME_QP: return "FWHT P-Frame QP Value"; /* VPX controls */ case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: return "VPX Number of Partitions"; case V4L2_CID_MPEG_VIDEO_VPX_IMD_DISABLE_4X4: return "VPX Intra Mode Decision Disable"; case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: return "VPX No. of Refs for P Frame"; case V4L2_CID_MPEG_VIDEO_VPX_FILTER_LEVEL: return "VPX Loop Filter Level Range"; case V4L2_CID_MPEG_VIDEO_VPX_FILTER_SHARPNESS: return "VPX Deblocking Effect Control"; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_REF_PERIOD: return "VPX Golden Frame Refresh Period"; case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: return "VPX Golden Frame Indicator"; case V4L2_CID_MPEG_VIDEO_VPX_MIN_QP: return "VPX Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_MAX_QP: return "VPX Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_I_FRAME_QP: return "VPX I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VPX_P_FRAME_QP: return "VPX P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: return "VP8 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: return "VP9 Profile"; case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: return "VP9 Level"; /* HEVC controls */ case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_QP: return "HEVC I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_QP: return "HEVC P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_QP: return "HEVC B-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_MIN_QP: return "HEVC Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_QP: return "HEVC Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_MIN_QP: return "HEVC I-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_I_FRAME_MAX_QP: return "HEVC I-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_MIN_QP: return "HEVC P-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_P_FRAME_MAX_QP: return "HEVC P-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MIN_QP: return "HEVC B-Frame Minimum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_B_FRAME_MAX_QP: return "HEVC B-Frame Maximum QP Value"; case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: return "HEVC Profile"; case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: return "HEVC Level"; case V4L2_CID_MPEG_VIDEO_HEVC_TIER: return "HEVC Tier"; case V4L2_CID_MPEG_VIDEO_HEVC_FRAME_RATE_RESOLUTION: return "HEVC Frame Rate Resolution"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_PARTITION_DEPTH: return "HEVC Maximum Coding Unit Depth"; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: return "HEVC Refresh Type"; case V4L2_CID_MPEG_VIDEO_HEVC_CONST_INTRA_PRED: return "HEVC Constant Intra Prediction"; case V4L2_CID_MPEG_VIDEO_HEVC_LOSSLESS_CU: return "HEVC Lossless Encoding"; case V4L2_CID_MPEG_VIDEO_HEVC_WAVEFRONT: return "HEVC Wavefront"; case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: return "HEVC Loop Filter"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_QP: return "HEVC QP Values"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: return "HEVC Hierarchical Coding Type"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_LAYER: return "HEVC Hierarchical Coding Layer"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L0_QP: return "HEVC Hierarchical Layer 0 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L1_QP: return "HEVC Hierarchical Layer 1 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L2_QP: return "HEVC Hierarchical Layer 2 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L3_QP: return "HEVC Hierarchical Layer 3 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L4_QP: return "HEVC Hierarchical Layer 4 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L5_QP: return "HEVC Hierarchical Layer 5 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L6_QP: return "HEVC Hierarchical Layer 6 QP"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L0_BR: return "HEVC Hierarchical Lay 0 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L1_BR: return "HEVC Hierarchical Lay 1 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L2_BR: return "HEVC Hierarchical Lay 2 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L3_BR: return "HEVC Hierarchical Lay 3 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L4_BR: return "HEVC Hierarchical Lay 4 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L5_BR: return "HEVC Hierarchical Lay 5 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_L6_BR: return "HEVC Hierarchical Lay 6 BitRate"; case V4L2_CID_MPEG_VIDEO_HEVC_GENERAL_PB: return "HEVC General PB"; case V4L2_CID_MPEG_VIDEO_HEVC_TEMPORAL_ID: return "HEVC Temporal ID"; case V4L2_CID_MPEG_VIDEO_HEVC_STRONG_SMOOTHING: return "HEVC Strong Intra Smoothing"; case V4L2_CID_MPEG_VIDEO_HEVC_INTRA_PU_SPLIT: return "HEVC Intra PU Split"; case V4L2_CID_MPEG_VIDEO_HEVC_TMV_PREDICTION: return "HEVC TMV Prediction"; case V4L2_CID_MPEG_VIDEO_HEVC_MAX_NUM_MERGE_MV_MINUS1: return "HEVC Max Num of Candidate MVs"; case V4L2_CID_MPEG_VIDEO_HEVC_WITHOUT_STARTCODE: return "HEVC ENC Without Startcode"; case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_PERIOD: return "HEVC Num of I-Frame b/w 2 IDR"; case V4L2_CID_MPEG_VIDEO_HEVC_LF_BETA_OFFSET_DIV2: return "HEVC Loop Filter Beta Offset"; case V4L2_CID_MPEG_VIDEO_HEVC_LF_TC_OFFSET_DIV2: return "HEVC Loop Filter TC Offset"; case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: return "HEVC Size of Length Field"; case V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES: return "Reference Frames for a P-Frame"; case V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR: return "Prepend SPS and PPS to IDR"; /* AV1 controls */ case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: return "AV1 Profile"; case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: return "AV1 Level"; /* CAMERA controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_CAMERA_CLASS: return "Camera Controls"; case V4L2_CID_EXPOSURE_AUTO: return "Auto Exposure"; case V4L2_CID_EXPOSURE_ABSOLUTE: return "Exposure Time, Absolute"; case V4L2_CID_EXPOSURE_AUTO_PRIORITY: return "Exposure, Dynamic Framerate"; case V4L2_CID_PAN_RELATIVE: return "Pan, Relative"; case V4L2_CID_TILT_RELATIVE: return "Tilt, Relative"; case V4L2_CID_PAN_RESET: return "Pan, Reset"; case V4L2_CID_TILT_RESET: return "Tilt, Reset"; case V4L2_CID_PAN_ABSOLUTE: return "Pan, Absolute"; case V4L2_CID_TILT_ABSOLUTE: return "Tilt, Absolute"; case V4L2_CID_FOCUS_ABSOLUTE: return "Focus, Absolute"; case V4L2_CID_FOCUS_RELATIVE: return "Focus, Relative"; case V4L2_CID_FOCUS_AUTO: return "Focus, Automatic Continuous"; case V4L2_CID_ZOOM_ABSOLUTE: return "Zoom, Absolute"; case V4L2_CID_ZOOM_RELATIVE: return "Zoom, Relative"; case V4L2_CID_ZOOM_CONTINUOUS: return "Zoom, Continuous"; case V4L2_CID_PRIVACY: return "Privacy"; case V4L2_CID_IRIS_ABSOLUTE: return "Iris, Absolute"; case V4L2_CID_IRIS_RELATIVE: return "Iris, Relative"; case V4L2_CID_AUTO_EXPOSURE_BIAS: return "Auto Exposure, Bias"; case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: return "White Balance, Auto & Preset"; case V4L2_CID_WIDE_DYNAMIC_RANGE: return "Wide Dynamic Range"; case V4L2_CID_IMAGE_STABILIZATION: return "Image Stabilization"; case V4L2_CID_ISO_SENSITIVITY: return "ISO Sensitivity"; case V4L2_CID_ISO_SENSITIVITY_AUTO: return "ISO Sensitivity, Auto"; case V4L2_CID_EXPOSURE_METERING: return "Exposure, Metering Mode"; case V4L2_CID_SCENE_MODE: return "Scene Mode"; case V4L2_CID_3A_LOCK: return "3A Lock"; case V4L2_CID_AUTO_FOCUS_START: return "Auto Focus, Start"; case V4L2_CID_AUTO_FOCUS_STOP: return "Auto Focus, Stop"; case V4L2_CID_AUTO_FOCUS_STATUS: return "Auto Focus, Status"; case V4L2_CID_AUTO_FOCUS_RANGE: return "Auto Focus, Range"; case V4L2_CID_PAN_SPEED: return "Pan, Speed"; case V4L2_CID_TILT_SPEED: return "Tilt, Speed"; case V4L2_CID_UNIT_CELL_SIZE: return "Unit Cell Size"; case V4L2_CID_CAMERA_ORIENTATION: return "Camera Orientation"; case V4L2_CID_CAMERA_SENSOR_ROTATION: return "Camera Sensor Rotation"; case V4L2_CID_HDR_SENSOR_MODE: return "HDR Sensor Mode"; /* FM Radio Modulator controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_FM_TX_CLASS: return "FM Radio Modulator Controls"; case V4L2_CID_RDS_TX_DEVIATION: return "RDS Signal Deviation"; case V4L2_CID_RDS_TX_PI: return "RDS Program ID"; case V4L2_CID_RDS_TX_PTY: return "RDS Program Type"; case V4L2_CID_RDS_TX_PS_NAME: return "RDS PS Name"; case V4L2_CID_RDS_TX_RADIO_TEXT: return "RDS Radio Text"; case V4L2_CID_RDS_TX_MONO_STEREO: return "RDS Stereo"; case V4L2_CID_RDS_TX_ARTIFICIAL_HEAD: return "RDS Artificial Head"; case V4L2_CID_RDS_TX_COMPRESSED: return "RDS Compressed"; case V4L2_CID_RDS_TX_DYNAMIC_PTY: return "RDS Dynamic PTY"; case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: return "RDS Traffic Announcement"; case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: return "RDS Traffic Program"; case V4L2_CID_RDS_TX_MUSIC_SPEECH: return "RDS Music"; case V4L2_CID_RDS_TX_ALT_FREQS_ENABLE: return "RDS Enable Alt Frequencies"; case V4L2_CID_RDS_TX_ALT_FREQS: return "RDS Alternate Frequencies"; case V4L2_CID_AUDIO_LIMITER_ENABLED: return "Audio Limiter Feature Enabled"; case V4L2_CID_AUDIO_LIMITER_RELEASE_TIME: return "Audio Limiter Release Time"; case V4L2_CID_AUDIO_LIMITER_DEVIATION: return "Audio Limiter Deviation"; case V4L2_CID_AUDIO_COMPRESSION_ENABLED: return "Audio Compression Enabled"; case V4L2_CID_AUDIO_COMPRESSION_GAIN: return "Audio Compression Gain"; case V4L2_CID_AUDIO_COMPRESSION_THRESHOLD: return "Audio Compression Threshold"; case V4L2_CID_AUDIO_COMPRESSION_ATTACK_TIME: return "Audio Compression Attack Time"; case V4L2_CID_AUDIO_COMPRESSION_RELEASE_TIME: return "Audio Compression Release Time"; case V4L2_CID_PILOT_TONE_ENABLED: return "Pilot Tone Feature Enabled"; case V4L2_CID_PILOT_TONE_DEVIATION: return "Pilot Tone Deviation"; case V4L2_CID_PILOT_TONE_FREQUENCY: return "Pilot Tone Frequency"; case V4L2_CID_TUNE_PREEMPHASIS: return "Pre-Emphasis"; case V4L2_CID_TUNE_POWER_LEVEL: return "Tune Power Level"; case V4L2_CID_TUNE_ANTENNA_CAPACITOR: return "Tune Antenna Capacitor"; /* Flash controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_FLASH_CLASS: return "Flash Controls"; case V4L2_CID_FLASH_LED_MODE: return "LED Mode"; case V4L2_CID_FLASH_STROBE_SOURCE: return "Strobe Source"; case V4L2_CID_FLASH_STROBE: return "Strobe"; case V4L2_CID_FLASH_STROBE_STOP: return "Stop Strobe"; case V4L2_CID_FLASH_STROBE_STATUS: return "Strobe Status"; case V4L2_CID_FLASH_TIMEOUT: return "Strobe Timeout"; case V4L2_CID_FLASH_INTENSITY: return "Intensity, Flash Mode"; case V4L2_CID_FLASH_TORCH_INTENSITY: return "Intensity, Torch Mode"; case V4L2_CID_FLASH_INDICATOR_INTENSITY: return "Intensity, Indicator"; case V4L2_CID_FLASH_FAULT: return "Faults"; case V4L2_CID_FLASH_CHARGE: return "Charge"; case V4L2_CID_FLASH_READY: return "Ready to Strobe"; /* JPEG encoder controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_JPEG_CLASS: return "JPEG Compression Controls"; case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: return "Chroma Subsampling"; case V4L2_CID_JPEG_RESTART_INTERVAL: return "Restart Interval"; case V4L2_CID_JPEG_COMPRESSION_QUALITY: return "Compression Quality"; case V4L2_CID_JPEG_ACTIVE_MARKER: return "Active Markers"; /* Image source controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_IMAGE_SOURCE_CLASS: return "Image Source Controls"; case V4L2_CID_VBLANK: return "Vertical Blanking"; case V4L2_CID_HBLANK: return "Horizontal Blanking"; case V4L2_CID_ANALOGUE_GAIN: return "Analogue Gain"; case V4L2_CID_TEST_PATTERN_RED: return "Red Pixel Value"; case V4L2_CID_TEST_PATTERN_GREENR: return "Green (Red) Pixel Value"; case V4L2_CID_TEST_PATTERN_BLUE: return "Blue Pixel Value"; case V4L2_CID_TEST_PATTERN_GREENB: return "Green (Blue) Pixel Value"; case V4L2_CID_NOTIFY_GAINS: return "Notify Gains"; /* Image processing controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_IMAGE_PROC_CLASS: return "Image Processing Controls"; case V4L2_CID_LINK_FREQ: return "Link Frequency"; case V4L2_CID_PIXEL_RATE: return "Pixel Rate"; case V4L2_CID_TEST_PATTERN: return "Test Pattern"; case V4L2_CID_DEINTERLACING_MODE: return "Deinterlacing Mode"; case V4L2_CID_DIGITAL_GAIN: return "Digital Gain"; /* DV controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_DV_CLASS: return "Digital Video Controls"; case V4L2_CID_DV_TX_HOTPLUG: return "Hotplug Present"; case V4L2_CID_DV_TX_RXSENSE: return "RxSense Present"; case V4L2_CID_DV_TX_EDID_PRESENT: return "EDID Present"; case V4L2_CID_DV_TX_MODE: return "Transmit Mode"; case V4L2_CID_DV_TX_RGB_RANGE: return "Tx RGB Quantization Range"; case V4L2_CID_DV_TX_IT_CONTENT_TYPE: return "Tx IT Content Type"; case V4L2_CID_DV_RX_POWER_PRESENT: return "Power Present"; case V4L2_CID_DV_RX_RGB_RANGE: return "Rx RGB Quantization Range"; case V4L2_CID_DV_RX_IT_CONTENT_TYPE: return "Rx IT Content Type"; case V4L2_CID_FM_RX_CLASS: return "FM Radio Receiver Controls"; case V4L2_CID_TUNE_DEEMPHASIS: return "De-Emphasis"; case V4L2_CID_RDS_RECEPTION: return "RDS Reception"; case V4L2_CID_RF_TUNER_CLASS: return "RF Tuner Controls"; case V4L2_CID_RF_TUNER_RF_GAIN: return "RF Gain"; case V4L2_CID_RF_TUNER_LNA_GAIN_AUTO: return "LNA Gain, Auto"; case V4L2_CID_RF_TUNER_LNA_GAIN: return "LNA Gain"; case V4L2_CID_RF_TUNER_MIXER_GAIN_AUTO: return "Mixer Gain, Auto"; case V4L2_CID_RF_TUNER_MIXER_GAIN: return "Mixer Gain"; case V4L2_CID_RF_TUNER_IF_GAIN_AUTO: return "IF Gain, Auto"; case V4L2_CID_RF_TUNER_IF_GAIN: return "IF Gain"; case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: return "Bandwidth, Auto"; case V4L2_CID_RF_TUNER_BANDWIDTH: return "Bandwidth"; case V4L2_CID_RF_TUNER_PLL_LOCK: return "PLL Lock"; case V4L2_CID_RDS_RX_PTY: return "RDS Program Type"; case V4L2_CID_RDS_RX_PS_NAME: return "RDS PS Name"; case V4L2_CID_RDS_RX_RADIO_TEXT: return "RDS Radio Text"; case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: return "RDS Traffic Announcement"; case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: return "RDS Traffic Program"; case V4L2_CID_RDS_RX_MUSIC_SPEECH: return "RDS Music"; /* Detection controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_DETECT_CLASS: return "Detection Controls"; case V4L2_CID_DETECT_MD_MODE: return "Motion Detection Mode"; case V4L2_CID_DETECT_MD_GLOBAL_THRESHOLD: return "MD Global Threshold"; case V4L2_CID_DETECT_MD_THRESHOLD_GRID: return "MD Threshold Grid"; case V4L2_CID_DETECT_MD_REGION_GRID: return "MD Region Grid"; /* Stateless Codec controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_CODEC_STATELESS_CLASS: return "Stateless Codec Controls"; case V4L2_CID_STATELESS_H264_DECODE_MODE: return "H264 Decode Mode"; case V4L2_CID_STATELESS_H264_START_CODE: return "H264 Start Code"; case V4L2_CID_STATELESS_H264_SPS: return "H264 Sequence Parameter Set"; case V4L2_CID_STATELESS_H264_PPS: return "H264 Picture Parameter Set"; case V4L2_CID_STATELESS_H264_SCALING_MATRIX: return "H264 Scaling Matrix"; case V4L2_CID_STATELESS_H264_PRED_WEIGHTS: return "H264 Prediction Weight Table"; case V4L2_CID_STATELESS_H264_SLICE_PARAMS: return "H264 Slice Parameters"; case V4L2_CID_STATELESS_H264_DECODE_PARAMS: return "H264 Decode Parameters"; case V4L2_CID_STATELESS_FWHT_PARAMS: return "FWHT Stateless Parameters"; case V4L2_CID_STATELESS_VP8_FRAME: return "VP8 Frame Parameters"; case V4L2_CID_STATELESS_MPEG2_SEQUENCE: return "MPEG-2 Sequence Header"; case V4L2_CID_STATELESS_MPEG2_PICTURE: return "MPEG-2 Picture Header"; case V4L2_CID_STATELESS_MPEG2_QUANTISATION: return "MPEG-2 Quantisation Matrices"; case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR: return "VP9 Probabilities Updates"; case V4L2_CID_STATELESS_VP9_FRAME: return "VP9 Frame Decode Parameters"; case V4L2_CID_STATELESS_HEVC_SPS: return "HEVC Sequence Parameter Set"; case V4L2_CID_STATELESS_HEVC_PPS: return "HEVC Picture Parameter Set"; case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS: return "HEVC Slice Parameters"; case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX: return "HEVC Scaling Matrix"; case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS: return "HEVC Decode Parameters"; case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return "HEVC Decode Mode"; case V4L2_CID_STATELESS_HEVC_START_CODE: return "HEVC Start Code"; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: return "HEVC Entry Point Offsets"; case V4L2_CID_STATELESS_AV1_SEQUENCE: return "AV1 Sequence Parameters"; case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: return "AV1 Tile Group Entry"; case V4L2_CID_STATELESS_AV1_FRAME: return "AV1 Frame Parameters"; case V4L2_CID_STATELESS_AV1_FILM_GRAIN: return "AV1 Film Grain"; /* Colorimetry controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_COLORIMETRY_CLASS: return "Colorimetry Controls"; case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO: return "HDR10 Content Light Info"; case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY: return "HDR10 Mastering Display"; default: return NULL; } } EXPORT_SYMBOL(v4l2_ctrl_get_name); void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, s64 *min, s64 *max, u64 *step, s64 *def, u32 *flags) { *name = v4l2_ctrl_get_name(id); *flags = 0; switch (id) { case V4L2_CID_AUDIO_MUTE: case V4L2_CID_AUDIO_LOUDNESS: case V4L2_CID_AUTO_WHITE_BALANCE: case V4L2_CID_AUTOGAIN: case V4L2_CID_HFLIP: case V4L2_CID_VFLIP: case V4L2_CID_HUE_AUTO: case V4L2_CID_CHROMA_AGC: case V4L2_CID_COLOR_KILLER: case V4L2_CID_AUTOBRIGHTNESS: case V4L2_CID_MPEG_AUDIO_MUTE: case V4L2_CID_MPEG_VIDEO_MUTE: case V4L2_CID_MPEG_VIDEO_GOP_CLOSURE: case V4L2_CID_MPEG_VIDEO_PULLDOWN: case V4L2_CID_EXPOSURE_AUTO_PRIORITY: case V4L2_CID_FOCUS_AUTO: case V4L2_CID_PRIVACY: case V4L2_CID_AUDIO_LIMITER_ENABLED: case V4L2_CID_AUDIO_COMPRESSION_ENABLED: case V4L2_CID_PILOT_TONE_ENABLED: case V4L2_CID_ILLUMINATORS_1: case V4L2_CID_ILLUMINATORS_2: case V4L2_CID_FLASH_STROBE_STATUS: case V4L2_CID_FLASH_CHARGE: case V4L2_CID_FLASH_READY: case V4L2_CID_MPEG_VIDEO_DECODER_MPEG4_DEBLOCK_FILTER: case V4L2_CID_MPEG_VIDEO_DECODER_SLICE_INTERFACE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE: case V4L2_CID_MPEG_VIDEO_FRAME_RC_ENABLE: case V4L2_CID_MPEG_VIDEO_MB_RC_ENABLE: case V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM: case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_ENABLE: case V4L2_CID_MPEG_VIDEO_MPEG4_QPEL: case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER: case V4L2_CID_MPEG_VIDEO_AU_DELIMITER: case V4L2_CID_WIDE_DYNAMIC_RANGE: case V4L2_CID_IMAGE_STABILIZATION: case V4L2_CID_RDS_RECEPTION: case V4L2_CID_RF_TUNER_LNA_GAIN_AUTO: case V4L2_CID_RF_TUNER_MIXER_GAIN_AUTO: case V4L2_CID_RF_TUNER_IF_GAIN_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_PLL_LOCK: case V4L2_CID_RDS_TX_MONO_STEREO: case V4L2_CID_RDS_TX_ARTIFICIAL_HEAD: case V4L2_CID_RDS_TX_COMPRESSED: case V4L2_CID_RDS_TX_DYNAMIC_PTY: case V4L2_CID_RDS_TX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_TX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_TX_MUSIC_SPEECH: case V4L2_CID_RDS_TX_ALT_FREQS_ENABLE: case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_RX_MUSIC_SPEECH: *type = V4L2_CTRL_TYPE_BOOLEAN; *min = 0; *max = *step = 1; break; case V4L2_CID_ROTATE: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_MODIFY_LAYOUT; break; case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE: case V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY: case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD: *type = V4L2_CTRL_TYPE_INTEGER; break; case V4L2_CID_MPEG_VIDEO_LTR_COUNT: *type = V4L2_CTRL_TYPE_INTEGER; break; case V4L2_CID_MPEG_VIDEO_FRAME_LTR_INDEX: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_MPEG_VIDEO_USE_LTR_FRAMES: *type = V4L2_CTRL_TYPE_BITMASK; *flags |= V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME: case V4L2_CID_PAN_RESET: case V4L2_CID_TILT_RESET: case V4L2_CID_FLASH_STROBE: case V4L2_CID_FLASH_STROBE_STOP: case V4L2_CID_AUTO_FOCUS_START: case V4L2_CID_AUTO_FOCUS_STOP: case V4L2_CID_DO_WHITE_BALANCE: *type = V4L2_CTRL_TYPE_BUTTON; *flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; *min = *max = *step = *def = 0; break; case V4L2_CID_POWER_LINE_FREQUENCY: case V4L2_CID_MPEG_AUDIO_SAMPLING_FREQ: case V4L2_CID_MPEG_AUDIO_ENCODING: case V4L2_CID_MPEG_AUDIO_L1_BITRATE: case V4L2_CID_MPEG_AUDIO_L2_BITRATE: case V4L2_CID_MPEG_AUDIO_L3_BITRATE: case V4L2_CID_MPEG_AUDIO_AC3_BITRATE: case V4L2_CID_MPEG_AUDIO_MODE: case V4L2_CID_MPEG_AUDIO_MODE_EXTENSION: case V4L2_CID_MPEG_AUDIO_EMPHASIS: case V4L2_CID_MPEG_AUDIO_CRC: case V4L2_CID_MPEG_AUDIO_DEC_PLAYBACK: case V4L2_CID_MPEG_AUDIO_DEC_MULTILINGUAL_PLAYBACK: case V4L2_CID_MPEG_VIDEO_ENCODING: case V4L2_CID_MPEG_VIDEO_ASPECT: case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: case V4L2_CID_MPEG_STREAM_TYPE: case V4L2_CID_MPEG_STREAM_VBI_FMT: case V4L2_CID_EXPOSURE_AUTO: case V4L2_CID_AUTO_FOCUS_RANGE: case V4L2_CID_COLORFX: case V4L2_CID_AUTO_N_PRESET_WHITE_BALANCE: case V4L2_CID_TUNE_PREEMPHASIS: case V4L2_CID_FLASH_LED_MODE: case V4L2_CID_FLASH_STROBE_SOURCE: case V4L2_CID_MPEG_VIDEO_HEADER_MODE: case V4L2_CID_MPEG_VIDEO_FRAME_SKIP_MODE: case V4L2_CID_MPEG_VIDEO_MULTI_SLICE_MODE: case V4L2_CID_MPEG_VIDEO_H264_ENTROPY_MODE: case V4L2_CID_MPEG_VIDEO_H264_LEVEL: case V4L2_CID_MPEG_VIDEO_H264_LOOP_FILTER_MODE: case V4L2_CID_MPEG_VIDEO_H264_PROFILE: case V4L2_CID_MPEG_VIDEO_H264_VUI_SAR_IDC: case V4L2_CID_MPEG_VIDEO_H264_SEI_FP_ARRANGEMENT_TYPE: case V4L2_CID_MPEG_VIDEO_H264_FMO_MAP_TYPE: case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_TYPE: case V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL: case V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE: case V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL: case V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE: case V4L2_CID_JPEG_CHROMA_SUBSAMPLING: case V4L2_CID_ISO_SENSITIVITY_AUTO: case V4L2_CID_EXPOSURE_METERING: case V4L2_CID_SCENE_MODE: case V4L2_CID_DV_TX_MODE: case V4L2_CID_DV_TX_RGB_RANGE: case V4L2_CID_DV_TX_IT_CONTENT_TYPE: case V4L2_CID_DV_RX_RGB_RANGE: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: case V4L2_CID_TEST_PATTERN: case V4L2_CID_DEINTERLACING_MODE: case V4L2_CID_TUNE_DEEMPHASIS: case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL: case V4L2_CID_MPEG_VIDEO_VP8_PROFILE: case V4L2_CID_MPEG_VIDEO_VP9_PROFILE: case V4L2_CID_MPEG_VIDEO_VP9_LEVEL: case V4L2_CID_DETECT_MD_MODE: case V4L2_CID_MPEG_VIDEO_HEVC_PROFILE: case V4L2_CID_MPEG_VIDEO_HEVC_LEVEL: case V4L2_CID_MPEG_VIDEO_HEVC_HIER_CODING_TYPE: case V4L2_CID_MPEG_VIDEO_HEVC_REFRESH_TYPE: case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: case V4L2_CID_MPEG_VIDEO_HEVC_TIER: case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: case V4L2_CID_STATELESS_HEVC_DECODE_MODE: case V4L2_CID_STATELESS_HEVC_START_CODE: case V4L2_CID_STATELESS_H264_DECODE_MODE: case V4L2_CID_STATELESS_H264_START_CODE: case V4L2_CID_CAMERA_ORIENTATION: case V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE: case V4L2_CID_HDR_SENSOR_MODE: *type = V4L2_CTRL_TYPE_MENU; break; case V4L2_CID_LINK_FREQ: *type = V4L2_CTRL_TYPE_INTEGER_MENU; break; case V4L2_CID_RDS_TX_PS_NAME: case V4L2_CID_RDS_TX_RADIO_TEXT: case V4L2_CID_RDS_RX_PS_NAME: case V4L2_CID_RDS_RX_RADIO_TEXT: *type = V4L2_CTRL_TYPE_STRING; break; case V4L2_CID_ISO_SENSITIVITY: case V4L2_CID_AUTO_EXPOSURE_BIAS: case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS: case V4L2_CID_MPEG_VIDEO_VPX_NUM_REF_FRAMES: *type = V4L2_CTRL_TYPE_INTEGER_MENU; break; case V4L2_CID_USER_CLASS: case V4L2_CID_CAMERA_CLASS: case V4L2_CID_CODEC_CLASS: case V4L2_CID_FM_TX_CLASS: case V4L2_CID_FLASH_CLASS: case V4L2_CID_JPEG_CLASS: case V4L2_CID_IMAGE_SOURCE_CLASS: case V4L2_CID_IMAGE_PROC_CLASS: case V4L2_CID_DV_CLASS: case V4L2_CID_FM_RX_CLASS: case V4L2_CID_RF_TUNER_CLASS: case V4L2_CID_DETECT_CLASS: case V4L2_CID_CODEC_STATELESS_CLASS: case V4L2_CID_COLORIMETRY_CLASS: *type = V4L2_CTRL_TYPE_CTRL_CLASS; /* You can neither read nor write these */ *flags |= V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY; *min = *max = *step = *def = 0; break; case V4L2_CID_BG_COLOR: case V4L2_CID_COLORFX_RGB: *type = V4L2_CTRL_TYPE_INTEGER; *step = 1; *min = 0; /* Max is calculated as RGB888 that is 2^24 - 1 */ *max = 0xffffff; break; case V4L2_CID_COLORFX_CBCR: *type = V4L2_CTRL_TYPE_INTEGER; *step = 1; *min = 0; *max = 0xffff; break; case V4L2_CID_FLASH_FAULT: case V4L2_CID_JPEG_ACTIVE_MARKER: case V4L2_CID_3A_LOCK: case V4L2_CID_AUTO_FOCUS_STATUS: case V4L2_CID_DV_TX_HOTPLUG: case V4L2_CID_DV_TX_RXSENSE: case V4L2_CID_DV_TX_EDID_PRESENT: case V4L2_CID_DV_RX_POWER_PRESENT: *type = V4L2_CTRL_TYPE_BITMASK; break; case V4L2_CID_MIN_BUFFERS_FOR_CAPTURE: case V4L2_CID_MIN_BUFFERS_FOR_OUTPUT: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_MPEG_VIDEO_DEC_PTS: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_VOLATILE | V4L2_CTRL_FLAG_READ_ONLY; *min = *def = 0; *max = 0x1ffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_DEC_FRAME: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_VOLATILE | V4L2_CTRL_FLAG_READ_ONLY; *min = *def = 0; *max = 0x7fffffffffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_DEC_CONCEAL_COLOR: *type = V4L2_CTRL_TYPE_INTEGER64; *min = 0; /* default for 8 bit black, luma is 16, chroma is 128 */ *def = 0x8000800010LL; *max = 0xffffffffffffLL; *step = 1; break; case V4L2_CID_MPEG_VIDEO_AVERAGE_QP: *type = V4L2_CTRL_TYPE_INTEGER; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_PIXEL_RATE: *type = V4L2_CTRL_TYPE_INTEGER64; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_DETECT_MD_REGION_GRID: *type = V4L2_CTRL_TYPE_U8; break; case V4L2_CID_DETECT_MD_THRESHOLD_GRID: *type = V4L2_CTRL_TYPE_U16; break; case V4L2_CID_RDS_TX_ALT_FREQS: *type = V4L2_CTRL_TYPE_U32; break; case V4L2_CID_STATELESS_MPEG2_SEQUENCE: *type = V4L2_CTRL_TYPE_MPEG2_SEQUENCE; break; case V4L2_CID_STATELESS_MPEG2_PICTURE: *type = V4L2_CTRL_TYPE_MPEG2_PICTURE; break; case V4L2_CID_STATELESS_MPEG2_QUANTISATION: *type = V4L2_CTRL_TYPE_MPEG2_QUANTISATION; break; case V4L2_CID_STATELESS_FWHT_PARAMS: *type = V4L2_CTRL_TYPE_FWHT_PARAMS; break; case V4L2_CID_STATELESS_H264_SPS: *type = V4L2_CTRL_TYPE_H264_SPS; break; case V4L2_CID_STATELESS_H264_PPS: *type = V4L2_CTRL_TYPE_H264_PPS; break; case V4L2_CID_STATELESS_H264_SCALING_MATRIX: *type = V4L2_CTRL_TYPE_H264_SCALING_MATRIX; break; case V4L2_CID_STATELESS_H264_SLICE_PARAMS: *type = V4L2_CTRL_TYPE_H264_SLICE_PARAMS; break; case V4L2_CID_STATELESS_H264_DECODE_PARAMS: *type = V4L2_CTRL_TYPE_H264_DECODE_PARAMS; break; case V4L2_CID_STATELESS_H264_PRED_WEIGHTS: *type = V4L2_CTRL_TYPE_H264_PRED_WEIGHTS; break; case V4L2_CID_STATELESS_VP8_FRAME: *type = V4L2_CTRL_TYPE_VP8_FRAME; break; case V4L2_CID_STATELESS_HEVC_SPS: *type = V4L2_CTRL_TYPE_HEVC_SPS; break; case V4L2_CID_STATELESS_HEVC_PPS: *type = V4L2_CTRL_TYPE_HEVC_PPS; break; case V4L2_CID_STATELESS_HEVC_SLICE_PARAMS: *type = V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_HEVC_SCALING_MATRIX: *type = V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX; break; case V4L2_CID_STATELESS_HEVC_DECODE_PARAMS: *type = V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS; break; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: *type = V4L2_CTRL_TYPE_U32; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_VP9_COMPRESSED_HDR: *type = V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR; break; case V4L2_CID_STATELESS_VP9_FRAME: *type = V4L2_CTRL_TYPE_VP9_FRAME; break; case V4L2_CID_STATELESS_AV1_SEQUENCE: *type = V4L2_CTRL_TYPE_AV1_SEQUENCE; break; case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: *type = V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY; *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; break; case V4L2_CID_STATELESS_AV1_FRAME: *type = V4L2_CTRL_TYPE_AV1_FRAME; break; case V4L2_CID_STATELESS_AV1_FILM_GRAIN: *type = V4L2_CTRL_TYPE_AV1_FILM_GRAIN; break; case V4L2_CID_UNIT_CELL_SIZE: *type = V4L2_CTRL_TYPE_AREA; *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_COLORIMETRY_HDR10_CLL_INFO: *type = V4L2_CTRL_TYPE_HDR10_CLL_INFO; break; case V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY: *type = V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY; break; default: *type = V4L2_CTRL_TYPE_INTEGER; break; } switch (id) { case V4L2_CID_MPEG_AUDIO_ENCODING: case V4L2_CID_MPEG_AUDIO_MODE: case V4L2_CID_MPEG_VIDEO_BITRATE_MODE: case V4L2_CID_MPEG_VIDEO_B_FRAMES: case V4L2_CID_MPEG_STREAM_TYPE: *flags |= V4L2_CTRL_FLAG_UPDATE; break; case V4L2_CID_AUDIO_VOLUME: case V4L2_CID_AUDIO_BALANCE: case V4L2_CID_AUDIO_BASS: case V4L2_CID_AUDIO_TREBLE: case V4L2_CID_BRIGHTNESS: case V4L2_CID_CONTRAST: case V4L2_CID_SATURATION: case V4L2_CID_HUE: case V4L2_CID_RED_BALANCE: case V4L2_CID_BLUE_BALANCE: case V4L2_CID_GAMMA: case V4L2_CID_SHARPNESS: case V4L2_CID_CHROMA_GAIN: case V4L2_CID_RDS_TX_DEVIATION: case V4L2_CID_AUDIO_LIMITER_RELEASE_TIME: case V4L2_CID_AUDIO_LIMITER_DEVIATION: case V4L2_CID_AUDIO_COMPRESSION_GAIN: case V4L2_CID_AUDIO_COMPRESSION_THRESHOLD: case V4L2_CID_AUDIO_COMPRESSION_ATTACK_TIME: case V4L2_CID_AUDIO_COMPRESSION_RELEASE_TIME: case V4L2_CID_PILOT_TONE_DEVIATION: case V4L2_CID_PILOT_TONE_FREQUENCY: case V4L2_CID_TUNE_POWER_LEVEL: case V4L2_CID_TUNE_ANTENNA_CAPACITOR: case V4L2_CID_RF_TUNER_RF_GAIN: case V4L2_CID_RF_TUNER_LNA_GAIN: case V4L2_CID_RF_TUNER_MIXER_GAIN: case V4L2_CID_RF_TUNER_IF_GAIN: case V4L2_CID_RF_TUNER_BANDWIDTH: case V4L2_CID_DETECT_MD_GLOBAL_THRESHOLD: *flags |= V4L2_CTRL_FLAG_SLIDER; break; case V4L2_CID_PAN_RELATIVE: case V4L2_CID_TILT_RELATIVE: case V4L2_CID_FOCUS_RELATIVE: case V4L2_CID_IRIS_RELATIVE: case V4L2_CID_ZOOM_RELATIVE: *flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; break; case V4L2_CID_FLASH_STROBE_STATUS: case V4L2_CID_AUTO_FOCUS_STATUS: case V4L2_CID_FLASH_READY: case V4L2_CID_DV_TX_HOTPLUG: case V4L2_CID_DV_TX_RXSENSE: case V4L2_CID_DV_TX_EDID_PRESENT: case V4L2_CID_DV_RX_POWER_PRESENT: case V4L2_CID_DV_RX_IT_CONTENT_TYPE: case V4L2_CID_RDS_RX_PTY: case V4L2_CID_RDS_RX_PS_NAME: case V4L2_CID_RDS_RX_RADIO_TEXT: case V4L2_CID_RDS_RX_TRAFFIC_ANNOUNCEMENT: case V4L2_CID_RDS_RX_TRAFFIC_PROGRAM: case V4L2_CID_RDS_RX_MUSIC_SPEECH: case V4L2_CID_CAMERA_ORIENTATION: case V4L2_CID_CAMERA_SENSOR_ROTATION: *flags |= V4L2_CTRL_FLAG_READ_ONLY; break; case V4L2_CID_RF_TUNER_PLL_LOCK: *flags |= V4L2_CTRL_FLAG_VOLATILE; break; } } EXPORT_SYMBOL(v4l2_ctrl_fill); |
| 2 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blktrace_api.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> #include <linux/list.h> #include <linux/blk-cgroup.h> #include "../../block/blk.h" #include <trace/events/block.h> #include "trace_output.h" #ifdef CONFIG_BLK_DEV_IO_TRACE static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 #define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; static struct tracer_flags blk_tracer_flags = { .val = 0, .opts = blk_tracer_opts, }; /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); } } /* * Send out a notify for this process, if we haven't done so since a trace * started */ static void trace_note_tsk(struct task_struct *tsk) { unsigned long flags; struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) { struct timespec64 now; unsigned long flags; u32 words[2]; /* need to check user space to see if this breaks in y2038 or y2106 */ ktime_get_real_ts64(&now); words[0] = (u32)now.tv_sec; words[1] = now.tv_nsec; local_irq_save(flags); trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } void __blk_trace_note_message(struct blk_trace *bt, struct cgroup_subsys_state *css, const char *fmt, ...) { int n; va_list args; unsigned long flags; char *buf; u64 cgid = 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) return; /* * If the BLK_TC_NOTIFY action mask isn't set, don't send any note * message to the trace. */ if (!(bt->act_mask & BLK_TC_NOTIFY)) return; local_irq_save(flags); buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); #ifdef CONFIG_BLK_CGROUP if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) cgid = cgroup_id(css->cgroup); else cgid = 1; #endif trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__blk_trace_note_message); static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; return 0; } /* * Data direction bit lookup */ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; #define BLK_TC_RAHEAD BLK_TC_AHEAD #define BLK_TC_PREFLUSH BLK_TC_FLUSH /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a * blk_io_trace structure and places it in a per-cpu subbuffer. */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, const blk_opf_t opf, u32 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; pid_t pid; int cpu; bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[op_is_write(op) ? WRITE : READ]; what |= MASK_TC_BIT(opf, SYNC); what |= MASK_TC_BIT(opf, RAHEAD); what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); if (cgid) what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len + cgid_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); goto record_it; } if (unlikely(tsk->btrace_seq != blktrace_seq)) trace_note_tsk(tsk); /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); record_it: /* * These two are not needed in ftrace as they are in the * generic trace_entry, filled by tracing_generic_entry_update, * but for the trace_event->bin() synthesizer benefit we do it * here too. */ t->cpu = cpu; t->pid = pid; t->sector = sector; t->bytes = bytes; t->action = what; t->device = bt->dev; t->error = error; t->pdu_len = pdu_len + cgid_len; if (cgid_len) memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } } local_irq_restore(flags); } static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); /* * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created * under 'q->debugfs_dir', thus lookup and remove them. */ if (!bt->dir) { debugfs_lookup_and_remove("dropped", q->debugfs_dir); debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); } static void get_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (++blk_probes_ref == 1) blk_register_tracepoints(); mutex_unlock(&blk_probe_mutex); } static void put_probe_ref(void) { mutex_lock(&blk_probe_mutex); if (!--blk_probes_ref) blk_unregister_tracepoints(); mutex_unlock(&blk_probe_mutex); } static int blk_trace_start(struct blk_trace *bt) { if (bt->trace_state != Blktrace_setup && bt->trace_state != Blktrace_stopped) return -EINVAL; blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); return 0; } static int blk_trace_stop(struct blk_trace *bt) { if (bt->trace_state != Blktrace_running) return -EINVAL; bt->trace_state = Blktrace_stopped; raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); return 0; } static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { blk_trace_stop(bt); synchronize_rcu(); blk_trace_free(q, bt); put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; blk_trace_cleanup(q, bt); return 0; } int blk_trace_remove(struct request_queue *q) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; char buf[16]; snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { char *msg; struct blk_trace *bt; if (count >= BLK_TN_MAX_MSG) return -EINVAL; msg = memdup_user_nul(buffer, count); if (IS_ERR(msg)) return PTR_ERR(msg); bt = filp->private_data; __blk_trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; } static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. */ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { struct blk_trace *bt; if (!relay_buf_full(buf)) return 1; bt = buf->chan->private_data; atomic_inc(&bt->dropped); return 0; } static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); return 0; } static struct dentry *blk_create_buf_file_callback(const char *filename, struct dentry *parent, umode_t mode, struct rchan_buf *buf, int *is_global) { return debugfs_create_file(filename, mode, parent, buf, &relay_file_operations); } static const struct rchan_callbacks blk_relay_callbacks = { .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { if (bdev) { bt->start_lba = bdev->bd_start_sect; bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; } } /* * Setup everything required to start tracing */ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; int ret; lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes * to underscores for this to work as expected */ strreplace(buts->name, '/', '_'); /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; /* * When tracing the whole disk reuse the existing debugfs directory * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory * is required, contrary to the usual mantra of not checking for debugfs * files or directories. */ if (IS_ERR_OR_NULL(dir)) { pr_warn("debugfs_dir not present for %s so skipping\n", buts->name); ret = -ENOENT; goto err; } bt->dev = dev; atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) goto err; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; if (buts->end_lba) bt->end_lba = buts->end_lba; bt->pid = buts->pid; bt->trace_state = Blktrace_setup; rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); ret = 0; err: if (ret) blk_trace_free(q, bt); return ret; } static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { __blk_trace_remove(q); return -EFAULT; } return 0; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; struct compat_blk_user_trace_setup cbuts; int ret; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, .start_lba = cbuts.start_lba, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { __blk_trace_remove(q); return -EFAULT; } return 0; } #endif static int __blk_trace_startstop(struct request_queue *q, int start) { struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; if (start) return blk_trace_start(bt); else return blk_trace_stop(bt); } int blk_trace_startstop(struct request_queue *q, int start) { int ret; mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); mutex_unlock(&q->debugfs_mutex); return ret; } EXPORT_SYMBOL_GPL(blk_trace_startstop); /* * When reading or writing the blktrace sysfs files, the references to the * opened sysfs or device files should prevent the underlying block device * from being removed. So no further delete protection is really needed. */ /** * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) { struct request_queue *q = bdev_get_queue(bdev); int ret, start = 0; char b[BDEVNAME_SIZE]; mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: snprintf(b, sizeof(b), "%pg", bdev); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #endif case BLKTRACESTART: start = 1; fallthrough; case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; break; } mutex_unlock(&q->debugfs_mutex); return ret; } /** * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ void blk_trace_shutdown(struct request_queue *q) { if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) __blk_trace_remove(q); } #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct cgroup_subsys_state *blkcg_css; struct blk_trace *bt; /* We don't use the 'bt' value here except as an optimization... */ bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; blkcg_css = bio_blkcg_css(bio); if (!blkcg_css) return 0; return cgroup_id(blkcg_css->cgroup); } #else static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } #endif static u64 blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* * blktrace probes */ /** * blk_add_trace_rq - Add a trace for a request oriented action * @rq: the source request * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); else what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, what, blk_status_to_errno(error), 0, NULL, cgid); rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_status_t error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, blk_trace_request_get_cgid(rq)); } /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for * @bio: the source bio * @what: the action * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, blk_status_to_errno(bio->bi_status)); } static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; if (explicit) what = BLK_TA_UNPLUG_IO; else what = BLK_TA_UNPLUG_TIMER; __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, blk_status_to_errno(bio->bi_status), sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } rcu_read_unlock(); } /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @bio: the source bio * @dev: source device * @from: source sector * * Called after a bio is remapped to a different device and/or sector. **/ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_REMAP, blk_status_to_errno(bio->bi_status), sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); rcu_read_unlock(); } /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation * @ignore: trace callback data parameter (not used) * @rq: the source request * @dev: target device * @from: source sector * * Description: * Device mapper remaps request to other devices. * Add a trace for that action. * **/ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq->cmd_flags, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data * @rq: io request * @data: driver-specific data * @len: length of driver-specific data * * Description: * Some drivers might want to write driver-specific data per request. * **/ void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); static void blk_register_tracepoints(void) { int ret; ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); tracepoint_synchronize_unregister(); } /* * struct blk_io_tracer formatting routines */ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } if (tc & BLK_TC_FLUSH) rwbs[i++] = 'F'; if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; if (tc & BLK_TC_FUA) rwbs[i++] = 'F'; if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; if (tc & BLK_TC_META) rwbs[i++] = 'M'; out: rwbs[i] = '\0'; } static inline const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) { return (const struct blk_io_trace *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } static inline u64 t_cgid(const struct trace_entry *ent) { return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) { return te_blk_io_trace(ent)->action; } static inline u32 t_bytes(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes; } static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; } static inline unsigned long long t_sector(const struct trace_entry *ent) { return te_blk_io_trace(ent)->sector; } static inline __u16 t_error(const struct trace_entry *ent) { return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { const __be64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, bool has_cg); static void blk_log_action_classic(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); trace_seq_printf(&iter->seq, "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, secs, nsec_rem, iter->ent->pid, act, rwbs); } static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; cgroup_path_from_kernfs_id(id, blkcg_name_buf, sizeof(blkcg_name_buf)); trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); } else { /* * The cgid portion used to be "INO,GEN". Userland * builds a FILEID_INO32_GEN fid out of them and * opens the cgroup using open_by_handle_at(2). * While 32bit ino setups are still the same, 64bit * ones now use the 64bit ino as the whole ID and * no longer use generation. * * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). */ trace_seq_printf(&iter->seq, "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), id & U32_MAX, id >> 32, act, rwbs); } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; pdu_buf = pdu_start(ent, has_cg); pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) if (pdu_buf[end]) break; end++; trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { trace_seq_printf(s, "%s%02x", i == 0 ? "" : " ", pdu_buf[i]); /* * stop when the rest is just zeros and indicate so * with a ".." appended */ if (i == end && end != pdu_len - 1) { trace_seq_puts(s, " ..) "); return; } } trace_seq_puts(s, ") "); } static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); else trace_seq_printf(s, "[%s]\n", cmd); } } static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), t_sec(ent), t_error(ent)); else trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); } } static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(be32_to_cpu(__r->device_from)), MINOR(be32_to_cpu(__r->device_from)), be64_to_cpu(__r->sector_from)); } static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s]\n", cmd); } static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), get_pdu_int(ent, has_cg), cmd); } static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { trace_seq_putmem(s, pdu_start(ent, has_cg), pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } /* * struct tracer operations */ static void blk_tracer_print_header(struct seq_file *m) { if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return; seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" "# | | | | | |\n"); } static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; } static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); return 0; } static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; } static void blk_tracer_reset(struct trace_array *tr) { blk_tracer_stop(tr); } static const struct { const char *act[2]; void (*print)(struct trace_seq *s, const struct trace_entry *ent, bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; static enum print_line_t print_one_line(struct trace_iterator *iter, bool classic) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; bool long_act; blk_log_action_t *log_action; bool has_cg; t = te_blk_io_trace(iter->ent); what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; has_cg = t->action & __BLK_TA_CGROUP; if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { log_action(iter, long_act ? "message" : "m", has_cg); blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { log_action(iter, what2act[what].act[long_act], has_cg); what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags, struct trace_event *event) { return print_one_line(iter, false); } static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, }; trace_seq_putmem(s, &old, offset); trace_seq_putmem(s, &t->sector, sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { blk_trace_synthesize_old_trace(iter); return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { if ((iter->ent->type != TRACE_BLK) || !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; return print_one_line(iter, true); } static int blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { if (set) tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; else tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; } return 0; } static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, .reset = blk_tracer_reset, .start = blk_tracer_start, .stop = blk_tracer_stop, .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, .set_flag = blk_tracer_set_flag, }; static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; static struct trace_event trace_blk_event = { .type = TRACE_BLK, .funcs = &trace_blk_event_funcs, }; static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } return 0; } device_initcall(init_blk_tracer); static int blk_trace_remove_queue(struct request_queue *q) { struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; blk_trace_stop(bt); put_probe_ref(); synchronize_rcu(); blk_trace_free(q, bt); return 0; } /* * Setup everything required to start tracing */ static int blk_trace_setup_queue(struct request_queue *q, struct block_device *bdev) { struct blk_trace *bt = NULL; int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto free_bt; bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; blk_trace_setup_lba(bt, bdev); rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); return 0; free_bt: blk_trace_free(q, bt); return ret; } /* * sysfs interface to enable and configure tracing */ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); #define BLK_TRACE_DEVICE_ATTR(_name) \ DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); static BLK_TRACE_DEVICE_ATTR(end_lba); static struct attribute *blk_trace_attrs[] = { &dev_attr_enable.attr, &dev_attr_act_mask.attr, &dev_attr_pid.attr, &dev_attr_start_lba.attr, &dev_attr_end_lba.attr, NULL }; struct attribute_group blk_trace_attr_group = { .name = "trace", .attrs = blk_trace_attrs, }; static const struct { int mask; const char *str; } mask_maps[] = { { BLK_TC_READ, "read" }, { BLK_TC_WRITE, "write" }, { BLK_TC_FLUSH, "flush" }, { BLK_TC_SYNC, "sync" }, { BLK_TC_QUEUE, "queue" }, { BLK_TC_REQUEUE, "requeue" }, { BLK_TC_ISSUE, "issue" }, { BLK_TC_COMPLETE, "complete" }, { BLK_TC_FS, "fs" }, { BLK_TC_PC, "pc" }, { BLK_TC_NOTIFY, "notify" }, { BLK_TC_AHEAD, "ahead" }, { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, }; static int blk_trace_str2mask(const char *str) { int i; int mask = 0; char *buf, *s, *token; buf = kstrdup(str, GFP_KERNEL); if (buf == NULL) return -ENOMEM; s = strstrip(buf); while (1) { token = strsep(&s, ","); if (token == NULL) break; if (*token == '\0') continue; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (strcasecmp(token, mask_maps[i].str) == 0) { mask |= mask_maps[i].mask; break; } } if (i == ARRAY_SIZE(mask_maps)) { mask = -EINVAL; break; } } kfree(buf); return mask; } static ssize_t blk_trace_mask2str(char *buf, int mask) { int i; char *p = buf; for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { if (mask & mask_maps[i].mask) { p += sprintf(p, "%s%s", (p == buf) ? "" : ",", mask_maps[i].str); } } *p++ = '\n'; return p - buf; } static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); return ret; } static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; if (count == 0) goto out; if (attr == &dev_attr_act_mask) { if (kstrtoull(buf, 0, &value)) { /* Assume it is a list of trace category names */ ret = blk_trace_str2mask(buf); if (ret < 0) goto out; value = ret; } } else { if (kstrtoull(buf, 0, &value)) goto out; } mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } if (value) ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; } ret = 0; if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { if (attr == &dev_attr_act_mask) bt->act_mask = value; else if (attr == &dev_attr_pid) bt->pid = value; else if (attr == &dev_attr_start_lba) bt->start_lba = value; else if (attr == &dev_attr_end_lba) bt->end_lba = value; } out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); out: return ret ? ret : count; } #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. * @rwbs: buffer to be filled * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint * * Description: * Maps each request operation and flag to a single character and fills the * buffer provided by the caller with resulting string. * **/ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) { int i = 0; if (opf & REQ_PREFLUSH) rwbs[i++] = 'F'; switch (opf & REQ_OP_MASK) { case REQ_OP_WRITE: rwbs[i++] = 'W'; break; case REQ_OP_DISCARD: rwbs[i++] = 'D'; break; case REQ_OP_SECURE_ERASE: rwbs[i++] = 'D'; rwbs[i++] = 'E'; break; case REQ_OP_FLUSH: rwbs[i++] = 'F'; break; case REQ_OP_READ: rwbs[i++] = 'R'; break; default: rwbs[i++] = 'N'; } if (opf & REQ_FUA) rwbs[i++] = 'F'; if (opf & REQ_RAHEAD) rwbs[i++] = 'A'; if (opf & REQ_SYNC) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 | // SPDX-License-Identifier: GPL-2.0 /* * SME code for cfg80211 * both driver SME event handling and the SME implementation * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020, 2022-2024 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/wireless.h> #include <linux/export.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include "nl80211.h" #include "reg.h" #include "rdev-ops.h" /* * Software SME in cfg80211, using auth/assoc/deauth calls to the * driver. This is for implementing nl80211's connect/disconnect * and wireless extensions (if configured.) */ struct cfg80211_conn { struct cfg80211_connect_params params; /* these are sub-states of the _CONNECTING sme_state */ enum { CFG80211_CONN_SCANNING, CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; const u8 *ie; size_t ie_len; bool auto_auth, prev_bssid_valid; }; static void cfg80211_sme_free(struct wireless_dev *wdev) { if (!wdev->conn) return; kfree(wdev->conn->ie); kfree(wdev->conn); wdev->conn = NULL; } static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_scan_request *request; int n_channels, err; lockdep_assert_wiphy(wdev->wiphy); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) n_channels = 1; else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + sizeof(request->channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; request->n_channels = n_channels; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; if (!sband) { kfree(request); return -EINVAL; } request->channels[0] = wdev->conn->params.channel; request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; for (j = 0; j < bands->n_channels; j++) { channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; request->channels[i++] = channel; } request->rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } request->n_channels = n_channels; request->ssids = (void *)request + struct_size(request, channels, n_channels); request->n_ssids = 1; memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len); request->ssids[0].ssid_len = wdev->conn->params.ssid_len; eth_broadcast_addr(request->bssid); request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; rdev->scan_req = request; err = cfg80211_scan(rdev); if (!err) { wdev->conn->state = CFG80211_CONN_SCANNING; nl80211_send_scan_start(rdev, wdev); dev_hold(wdev->netdev); } else { rdev->scan_req = NULL; kfree(request); } return err; } static int cfg80211_conn_do_work(struct wireless_dev *wdev, enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; struct cfg80211_auth_request auth_req = {}; struct cfg80211_assoc_request req = {}; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return 0; params = &wdev->conn->params; switch (wdev->conn->state) { case CFG80211_CONN_SCANNING: /* didn't find it during scan ... */ return -ENOENT; case CFG80211_CONN_SCAN_AGAIN: return cfg80211_conn_scan(wdev); case CFG80211_CONN_AUTHENTICATE_NEXT: if (WARN_ON(!rdev->ops->auth)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; auth_req.key = params->key; auth_req.key_len = params->key_len; auth_req.key_idx = params->key_idx; auth_req.auth_type = params->auth_type; auth_req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); auth_req.link_id = -1; err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req); cfg80211_put_bss(&rdev->wiphy, auth_req.bss); return err; case CFG80211_CONN_AUTH_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) req.prev_bssid = wdev->conn->prev_bssid; req.ie = params->ie; req.ie_len = params->ie_len; req.use_mfp = params->mfp != NL80211_MFP_NO; req.crypto = params->crypto; req.flags = params->flags; req.ht_capa = params->ht_capa; req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; req.link_id = -1; req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (!req.bss) { err = -ENOENT; } else { err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req, NULL); cfg80211_put_bss(&rdev->wiphy, req.bss); } if (err) cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return err; case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_ASSOC; fallthrough; case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return -ENOTCONN; case CFG80211_CONN_DEAUTH: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); fallthrough; case CFG80211_CONN_ABANDON: /* free directly, disconnected event already sent */ cfg80211_sme_free(wdev); return 0; default: return 0; } } void cfg80211_conn_work(struct work_struct *work) { struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; if (!netif_running(wdev->netdev)) continue; if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) continue; if (wdev->conn->params.bssid) { memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } treason = NL80211_TIMEOUT_UNSPECIFIED; if (cfg80211_conn_do_work(wdev, &treason)) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = -1; cr.links[0].bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } } wiphy_unlock(&rdev->wiphy); } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, struct cfg80211_bss *bss) { memcpy(conn->bssid, bss->bssid, ETH_ALEN); conn->params.bssid = conn->bssid; conn->params.channel = bss->channel; conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; } void cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return; if (wdev->conn->state != CFG80211_CONN_SCANNING && wdev->conn->state != CFG80211_CONN_SCAN_AGAIN) return; bss = cfg80211_get_conn_bss(wdev); if (bss) cfg80211_put_bss(&rdev->wiphy, bss); else schedule_work(&rdev->conn_work); } void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) return; if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG && wdev->conn->auto_auth && wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) { /* select automatically between only open, shared, leap */ switch (wdev->conn->params.auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: if (wdev->connect_keys) wdev->conn->params.auth_type = NL80211_AUTHTYPE_SHARED_KEY; else wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; case NL80211_AUTHTYPE_SHARED_KEY: wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; default: /* huh? */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; break; } wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; schedule_work(&rdev->conn_work); } else if (status_code != WLAN_STATUS_SUCCESS) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = status_code; cr.links[0].bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); } } bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return false; if (status == WLAN_STATUS_SUCCESS) { wdev->conn->state = CFG80211_CONN_CONNECTED; return false; } if (wdev->conn->prev_bssid_valid) { /* * Some stupid APs don't accept reassoc, so we * need to fall back to trying regular assoc; * return true so no event is sent to userspace. */ wdev->conn->prev_bssid_valid = false; wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); return true; } wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; schedule_work(&rdev->conn_work); return false; } void cfg80211_sme_deauth(struct wireless_dev *wdev) { cfg80211_sme_free(wdev); } void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_disassoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_DEAUTH; schedule_work(&rdev->conn_work); } void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ABANDON; schedule_work(&rdev->conn_work); } static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss || !(link_mask & BIT(link))) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *buf; size_t offs; if (!rdev->wiphy.extended_capabilities_len || (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) { *out_ies = kmemdup(ies, ies_len, GFP_KERNEL); if (!*out_ies) return -ENOMEM; *out_ies_len = ies_len; return 0; } buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2, GFP_KERNEL); if (!buf) return -ENOMEM; if (ies_len) { static const u8 before_extcapa[] = { /* not listing IEs expected to be created by driver */ WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_BSS_COEX_2040, }; offs = ieee80211_ie_split(ies, ies_len, before_extcapa, ARRAY_SIZE(before_extcapa), 0); memcpy(buf, ies, offs); /* leave a whole for extended capabilities IE */ memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2, ies + offs, ies_len - offs); } else { offs = 0; } /* place extended capabilities IE (with only driver capabilities) */ buf[offs] = WLAN_EID_EXT_CAPABILITY; buf[offs + 1] = rdev->wiphy.extended_capabilities_len; memcpy(buf + offs + 2, rdev->wiphy.extended_capabilities, rdev->wiphy.extended_capabilities_len); *out_ies = buf; *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2; return 0; } static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; int err; if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; cfg80211_wdev_release_bsses(wdev); if (wdev->connected) { cfg80211_sme_free(wdev); wdev->connected = false; } if (wdev->conn) return -EINPROGRESS; wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); if (!wdev->conn) return -ENOMEM; /* * Copy all parameters, and treat explicitly IEs, BSSID, SSID. */ memcpy(&wdev->conn->params, connect, sizeof(*connect)); if (connect->bssid) { wdev->conn->params.bssid = wdev->conn->bssid; memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); } if (cfg80211_sme_get_conn_ies(wdev, connect->ie, connect->ie_len, &wdev->conn->ie, &wdev->conn->params.ie_len)) { kfree(wdev->conn); wdev->conn = NULL; return -ENOMEM; } wdev->conn->params.ie = wdev->conn->ie; if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { wdev->conn->auto_auth = true; /* start with open system ... should mostly work */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } else { wdev->conn->auto_auth = false; } wdev->conn->params.ssid = wdev->u.client.ssid; wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); wdev->conn->prev_bssid_valid = true; } /* we're good if we have a matching bss struct */ if (bss) { enum nl80211_timeout_reason treason; cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ err = cfg80211_conn_scan(wdev); /* * If we can't scan right now, then we need to scan again * after the current scan finished, since the parameters * changed (unless we find a good AP anyway). */ if (err == -EBUSY) { err = 0; wdev->conn->state = CFG80211_CONN_SCAN_AGAIN; } } if (err) cfg80211_sme_free(wdev); return err; } static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err; if (!wdev->conn) return 0; if (!rdev->ops->deauth) return -EOPNOTSUPP; if (wdev->conn->state == CFG80211_CONN_SCANNING || wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) { err = 0; goto out; } /* wdev->conn->params.bssid must be set if > SCANNING */ err = cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->conn->params.bssid, NULL, 0, reason, false); out: cfg80211_sme_free(wdev); return err; } /* * code shared for in-device and software SME */ static bool cfg80211_is_all_idle(void) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; bool is_all_idle = true; /* * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. * Also if there is any other active beaconing interface we * need not issue a disconnect hint and reset any info such * as chan dfs state, etc. */ for_each_rdev(rdev) { wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; } wiphy_unlock(&rdev->wiphy); } return is_all_idle; } static void disconnect_work(struct work_struct *work) { rtnl_lock(); if (cfg80211_is_all_idle()) regulatory_hint_disconnect(); rtnl_unlock(); } DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); static void cfg80211_connect_result_release_bsses(struct wireless_dev *wdev, struct cfg80211_connect_resp_params *cr) { unsigned int link; for_each_valid_link(cr, link) { if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } } /* * API calls for drivers implementing connect/disconnect and * SME event handling */ /* This method must consume bss one way or another */ void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *cr, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct element *country_elem = NULL; const struct element *ssid; const u8 *country_data; u8 country_datalen; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; bool bss_not_found = false; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (cr->valid_links) { if (WARN_ON(!cr->ap_mld_addr)) goto out; for_each_valid_link(cr, link) { if (WARN_ON(!cr->links[link].addr)) goto out; } if (WARN_ON(wdev->connect_keys)) goto out; } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); connected_addr = cr->valid_links ? cr->ap_mld_addr : cr->links[0].bssid; #ifdef CONFIG_CFG80211_WEXT if (wextev && !cr->valid_links) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, cr->req_ie); } if (cr->resp_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->resp_ie_len; wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, cr->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) { memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif if (cr->status == WLAN_STATUS_SUCCESS) { if (!wiphy_to_rdev(wdev->wiphy)->ops->connect) { for_each_valid_link(cr, link) { if (WARN_ON_ONCE(!cr->links[link].bss)) break; } } for_each_valid_link(cr, link) { /* don't do extra lookups for failures */ if (cr->links[link].status != WLAN_STATUS_SUCCESS) continue; if (cr->links[link].bss) continue; cr->links[link].bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!cr->links[link].bss) { bss_not_found = true; break; } cfg80211_hold_bss(bss_from_pub(cr->links[link].bss)); } } cfg80211_wdev_release_bsses(wdev); if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; cfg80211_connect_result_release_bsses(wdev, cr); cfg80211_sme_free(wdev); return; } if (WARN_ON(bss_not_found)) { cfg80211_connect_result_release_bsses(wdev, cr); return; } memset(wdev->links, 0, sizeof(wdev->links)); for_each_valid_link(cr, link) { if (cr->links[link].status == WLAN_STATUS_SUCCESS) continue; cr->valid_links &= ~BIT(link); /* don't require bss pointer for failed links */ if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } wdev->valid_links = cr->valid_links; for_each_valid_link(cr, link) wdev->links[link].client.current_bss = bss_from_pub(cr->links[link].bss); wdev->connected = true; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (cr->valid_links) { for_each_valid_link(cr, link) memcpy(wdev->links[link].addr, cr->links[link].addr, ETH_ALEN); } cfg80211_upload_connect_keys(wdev); rcu_read_lock(); for_each_valid_link(cr, link) { country_elem = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_COUNTRY); if (country_elem) break; } if (!country_elem) { rcu_read_unlock(); return; } country_datalen = country_elem->datalen; country_data = kmemdup(country_elem->data, country_datalen, GFP_ATOMIC); rcu_read_unlock(); if (!country_data) return; regulatory_hint_country_ie(wdev->wiphy, cr->links[link].bss->channel->band, country_data, country_datalen); kfree(country_data); if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); wdev->u.client.ssid_len = ssid->datalen; break; } rcu_read_unlock(); } return; out: for_each_valid_link(cr, link) cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } static void cfg80211_update_link_bss(struct wireless_dev *wdev, struct cfg80211_bss **bss) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_internal_bss *ibss; if (!*bss) return; ibss = bss_from_pub(*bss); if (list_empty(&ibss->list)) { struct cfg80211_bss *found = NULL, *tmp = *bss; found = cfg80211_get_bss(wdev->wiphy, NULL, (*bss)->bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (found) { /* The same BSS is already updated so use it * instead, as it has latest info. */ *bss = found; } else { /* Update with BSS provided by driver, it will * be freshly added and ref cnted, we can free * the old one. * * signal_valid can be false, as we are not * expecting the BSS to be found. * * keep the old timestamp to avoid confusion */ cfg80211_bss_update(rdev, ibss, false, ibss->ts); } cfg80211_put_bss(wdev->wiphy, tmp); } } /* Consumes bss object(s) one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; size_t link_info_size = 0; unsigned int link; for_each_valid_link(params, link) { cfg80211_update_link_bss(wdev, ¶ms->links[link].bss); link_info_size += params->links[link].bssid ? ETH_ALEN : 0; link_info_size += params->links[link].addr ? ETH_ALEN : 0; } ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + (params->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size, gfp); if (!ev) { for_each_valid_link(params, link) cfg80211_put_bss(wdev->wiphy, params->links[link].bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); if (params->ap_mld_addr) { ev->cr.ap_mld_addr = next; memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { ev->cr.req_ie = next; ev->cr.req_ie_len = params->req_ie_len; memcpy((void *)ev->cr.req_ie, params->req_ie, params->req_ie_len); next += params->req_ie_len; } if (params->resp_ie_len) { ev->cr.resp_ie = next; ev->cr.resp_ie_len = params->resp_ie_len; memcpy((void *)ev->cr.resp_ie, params->resp_ie, params->resp_ie_len); next += params->resp_ie_len; } if (params->fils.kek_len) { ev->cr.fils.kek = next; ev->cr.fils.kek_len = params->fils.kek_len; memcpy((void *)ev->cr.fils.kek, params->fils.kek, params->fils.kek_len); next += params->fils.kek_len; } if (params->fils.pmk_len) { ev->cr.fils.pmk = next; ev->cr.fils.pmk_len = params->fils.pmk_len; memcpy((void *)ev->cr.fils.pmk, params->fils.pmk, params->fils.pmk_len); next += params->fils.pmk_len; } if (params->fils.pmkid) { ev->cr.fils.pmkid = next; memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; ev->cr.valid_links = params->valid_links; for_each_valid_link(params, link) { if (params->links[link].bss) cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; ev->cr.links[link].status = params->links[link].status; if (params->links[link].addr) { ev->cr.links[link].addr = next; memcpy((void *)ev->cr.links[link].addr, params->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (params->links[link].bssid) { ev->cr.links[link].bssid = next; memcpy((void *)ev->cr.links[link].bssid, params->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_connect_done); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info) { #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (WARN_ON(!wdev->connected)) goto out; if (info->valid_links) { if (WARN_ON(!info->ap_mld_addr)) goto out; for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].addr)) goto out; } } cfg80211_wdev_release_bsses(wdev); for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].bss)) goto out; } memset(wdev->links, 0, sizeof(wdev->links)); wdev->valid_links = info->valid_links; for_each_valid_link(info, link) { cfg80211_hold_bss(bss_from_pub(info->links[link].bss)); wdev->links[link].client.current_bss = bss_from_pub(info->links[link].bss); } connected_addr = info->valid_links ? info->ap_mld_addr : info->links[0].bss->bssid; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (info->valid_links) { for_each_valid_link(info, link) memcpy(wdev->links[link].addr, info->links[link].addr, ETH_ALEN); } wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (!info->valid_links) { if (info->req_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->req_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCREQIE, &wrqu, info->req_ie); } if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->resp_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, &wrqu, info->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); } #endif return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } /* Consumes info->links.bss object(s) one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; unsigned int link; size_t link_info_size = 0; bool bss_not_found = false; for_each_valid_link(info, link) { link_info_size += info->links[link].addr ? ETH_ALEN : 0; link_info_size += info->links[link].bssid ? ETH_ALEN : 0; if (info->links[link].bss) continue; info->links[link].bss = cfg80211_get_bss(wdev->wiphy, info->links[link].channel, info->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!info->links[link].bss) { bss_not_found = true; break; } } if (WARN_ON(bss_not_found)) goto out; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + (info->ap_mld_addr ? ETH_ALEN : 0) + link_info_size, gfp); if (!ev) goto out; ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); if (info->req_ie_len) { ev->rm.req_ie = next; ev->rm.req_ie_len = info->req_ie_len; memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); next += info->req_ie_len; } if (info->resp_ie_len) { ev->rm.resp_ie = next; ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); next += info->resp_ie_len; } if (info->fils.kek_len) { ev->rm.fils.kek = next; ev->rm.fils.kek_len = info->fils.kek_len; memcpy((void *)ev->rm.fils.kek, info->fils.kek, info->fils.kek_len); next += info->fils.kek_len; } if (info->fils.pmk_len) { ev->rm.fils.pmk = next; ev->rm.fils.pmk_len = info->fils.pmk_len; memcpy((void *)ev->rm.fils.pmk, info->fils.pmk, info->fils.pmk_len); next += info->fils.pmk_len; } if (info->fils.pmkid) { ev->rm.fils.pmkid = next; memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; if (info->ap_mld_addr) { ev->rm.ap_mld_addr = next; memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } ev->rm.valid_links = info->valid_links; for_each_valid_link(info, link) { ev->rm.links[link].bss = info->links[link].bss; if (info->links[link].addr) { ev->rm.links[link].addr = next; memcpy((void *)ev->rm.links[link].addr, info->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (info->links[link].bssid) { ev->rm.links[link].bssid = next; memcpy((void *)ev->rm.links[link].bssid, info->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } EXPORT_SYMBOL(cfg80211_roamed); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len) { lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT && wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO)) return; if (wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { if (WARN_ON(!wdev->connected) || WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, peer_addr))) return; } nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, peer_addr, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; if (WARN_ON(!peer_addr)) return; ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.peer_addr, peer_addr, ETH_ALEN); ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); ev->pa.td_bitmap_len = td_bitmap_len; memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending * connected/roamed events, they will be reported first. */ spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_port_authorized); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; cfg80211_wdev_release_bsses(wdev); wdev->valid_links = 0; wdev->connected = false; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); /* stop critical protocol if supported */ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) { int max_key_idx = 5; if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); wdev->wext.connect.ssid_len = 0; #endif schedule_work(&cfg80211_disconnect_work); cfg80211_schedule_channels_check(wdev); } void cfg80211_disconnected(struct net_device *dev, u16 reason, const u8 *ie, size_t ie_len, bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev) + ie_len, gfp); if (!ev) return; ev->type = EVENT_DISCONNECTED; ev->dc.ie = ((u8 *)ev) + sizeof(*ev); ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_disconnected); /* * API calls for nl80211/wext compatibility code */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); /* * If we have an ssid_len, we're trying to connect or are * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ if (wdev->u.client.ssid_len && (wdev->u.client.ssid_len != connect->ssid_len || memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. */ if (wdev->connected) { if (!prev_bssid) return -EALREADY; if (!ether_addr_equal(prev_bssid, wdev->u.client.connected_addr)) return -ENOTCONN; } /* * Reject if we're in the process of connecting with WEP, * this case isn't very interesting and trying to handle * it would make the code much more complex. */ if (wdev->connect_keys) return -EINPROGRESS; cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; u32 cipher; idx = connkeys->def; cipher = connkeys->params[idx].cipher; /* If given a WEP key we may need it for shared key auth */ if (cipher == WLAN_CIPHER_SUITE_WEP40 || cipher == WLAN_CIPHER_SUITE_WEP104) { connect->key_idx = idx; connect->key = connkeys->params[idx].key; connect->key_len = connkeys->params[idx].key_len; /* * If ciphers are not set (e.g. when going through * iwconfig), we have to set them appropriately here. */ if (connect->crypto.cipher_group == 0) connect->crypto.cipher_group = cipher; if (connect->crypto.n_ciphers_pairwise == 0) { connect->crypto.n_ciphers_pairwise = 1; connect->crypto.ciphers_pairwise[0] = cipher; } } } else { if (WARN_ON(connkeys)) return -EINVAL; /* connect can point to wdev->wext.connect which * can hold key data from a previous connection */ connect->key = NULL; connect->key_len = 0; connect->key_idx = 0; } wdev->connect_keys = connkeys; memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len); wdev->u.client.ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else err = rdev_connect(rdev, dev, connect); if (err) { wdev->connect_keys = NULL; /* * This could be reassoc getting refused, don't clear * ssid_len in that case. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } return 0; } int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err = 0; lockdep_assert_wiphy(wdev->wiphy); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->conn_owner_nlportid = 0; if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); else if (wdev->u.client.ssid_len) err = rdev_disconnect(rdev, dev, reason); /* * Clear ssid_len unless we actually were fully connected, * in which case cfg80211_disconnected() will take care of * this later. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } /* * Used to clean up after the connection / connection attempt owner socket * disconnects */ void cfg80211_autodisconnect_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); wiphy_lock(wdev->wiphy); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, -1, false); break; case NL80211_IFTYPE_MESH_POINT: cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* * Use disconnect_bssid if still connecting and * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. */ if (rdev->ops->disconnect || wdev->connected) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); else cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->disconnect_bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); break; default: break; } } wiphy_unlock(wdev->wiphy); } |
| 60 61 29 8 77 77 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 | /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM erofs #if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EROFS_H #include <linux/tracepoint.h> #include <linux/fs.h> struct erofs_map_blocks; #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_nid(entry) show_dev(entry->dev), entry->nid #define show_file_type(type) \ __print_symbolic(type, \ { 0, "FILE" }, \ { 1, "DIR" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EROFS_GET_BLOCKS_FIEMAP, "FIEMAP" }, \ { EROFS_GET_BLOCKS_READMORE, "READMORE" }, \ { EROFS_GET_BLOCKS_FINDTAIL, "FINDTAIL" }) #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ { EROFS_MAP_ENCODED, "E" }, \ { EROFS_MAP_FULL_MAPPED, "F" }, \ { EROFS_MAP_FRAGMENT, "R" }, \ { EROFS_MAP_PARTIAL_REF, "P" }) TRACE_EVENT(erofs_lookup, TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; __assign_str(name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), __get_str(name), __entry->flags) ); TRACE_EVENT(erofs_fill_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(erofs_blk_t, blkaddr ) __field(unsigned int, ofs ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->blkaddr = erofs_blknr(inode->i_sb, erofs_iloc(inode)); __entry->ofs = erofs_blkoff(inode->i_sb, erofs_iloc(inode)); ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u", show_dev_nid(__entry), __entry->blkaddr, __entry->ofs) ); TRACE_EVENT(erofs_read_folio, TP_PROTO(struct folio *folio, bool raw), TP_ARGS(folio, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(int, dir ) __field(pgoff_t, index ) __field(int, uptodate) __field(bool, raw ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->nid = EROFS_I(folio->mapping->host)->nid; __entry->dir = S_ISDIR(folio->mapping->host->i_mode); __entry->index = folio->index; __entry->uptodate = folio_test_uptodate(folio); __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d " "raw = %d", show_dev_nid(__entry), show_file_type(__entry->dir), (unsigned long)__entry->index, __entry->uptodate, __entry->raw) ); TRACE_EVENT(erofs_readpages, TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(pgoff_t, start ) __field(unsigned int, nrpage ) __field(bool, raw ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->start = start; __entry->nrpage = nrpage; __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d", show_dev_nid(__entry), (unsigned long)__entry->start, __entry->nrpage, __entry->raw) ); TRACE_EVENT(erofs_map_blocks_enter, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags), TP_ARGS(inode, map, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( erofs_off_t, la ) __field( u64, llen ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->la = map->m_la; __entry->llen = map->m_llen; __entry->flags = flags; ), TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s", show_dev_nid(__entry), __entry->la, __entry->llen, __entry->flags ? show_map_flags(__entry->flags) : "NULL") ); TRACE_EVENT(erofs_map_blocks_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags, int ret), TP_ARGS(inode, map, flags, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( unsigned int, flags ) __field( erofs_off_t, la ) __field( erofs_off_t, pa ) __field( u64, llen ) __field( u64, plen ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->flags = flags; __entry->la = map->m_la; __entry->pa = map->m_pa; __entry->llen = map->m_llen; __entry->plen = map->m_plen; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), nid = %llu, flags %s " "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", show_dev_nid(__entry), __entry->flags ? show_map_flags(__entry->flags) : "NULL", __entry->la, __entry->pa, __entry->llen, __entry->plen, show_mflags(__entry->mflags), __entry->ret) ); TRACE_EVENT(erofs_destroy_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; ), TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) ); #endif /* _TRACE_EROFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 141 183 3 13 209 214 224 11 5 2 3 2 2 63 100 103 180 17 220 69 220 169 4 1 220 1 220 220 131 131 199 140 140 140 139 140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BKEY_H #define _BCACHEFS_BKEY_H #include <linux/bug.h> #include "bcachefs_format.h" #include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" enum bch_validate_flags { BCH_VALIDATE_write = BIT(0), BCH_VALIDATE_commit = BIT(1), BCH_VALIDATE_journal = BIT(2), BCH_VALIDATE_silent = BIT(3), }; #if 0 /* * compiled unpack functions are disabled, pending a new interface for * dynamically allocating executable memory: */ #ifdef CONFIG_X86_64 #define HAVE_BCACHEFS_COMPILED_UNPACK 1 #endif #endif void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, BKEY_PACKED_LEFT, BKEY_PACKED_NONE, }; #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) { memcpy_u64s_small(dst, src, src->u64s); } static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) { memcpy_u64s_small(dst, src, src->k.u64s); } struct btree; __pure unsigned bch2_bkey_greatest_differing_bit(const struct btree *, const struct bkey_packed *, const struct bkey_packed *); __pure unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); __pure int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, const struct bkey_packed *, const struct btree *); __pure int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, const struct bkey_packed *, const struct bpos *); __pure int bch2_bkey_cmp_packed(const struct btree *, const struct bkey_packed *, const struct bkey_packed *); __pure int __bch2_bkey_cmp_left_packed(const struct btree *, const struct bkey_packed *, const struct bpos *); static inline __pure int bkey_cmp_left_packed(const struct btree *b, const struct bkey_packed *l, const struct bpos *r) { return __bch2_bkey_cmp_left_packed(b, l, r); } /* * The compiler generates better code when we pass bpos by ref, but it's often * enough terribly convenient to pass it by val... as much as I hate c++, const * ref would be nice here: */ __pure __flatten static inline int bkey_cmp_left_packed_byval(const struct btree *b, const struct bkey_packed *l, struct bpos r) { return bkey_cmp_left_packed(b, l, &r); } static __always_inline bool bpos_eq(struct bpos l, struct bpos r) { return !((l.inode ^ r.inode) | (l.offset ^ r.offset) | (l.snapshot ^ r.snapshot)); } static __always_inline bool bpos_lt(struct bpos l, struct bpos r) { return l.inode != r.inode ? l.inode < r.inode : l.offset != r.offset ? l.offset < r.offset : l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; } static __always_inline bool bpos_le(struct bpos l, struct bpos r) { return l.inode != r.inode ? l.inode < r.inode : l.offset != r.offset ? l.offset < r.offset : l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; } static __always_inline bool bpos_gt(struct bpos l, struct bpos r) { return bpos_lt(r, l); } static __always_inline bool bpos_ge(struct bpos l, struct bpos r) { return bpos_le(r, l); } static __always_inline int bpos_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: cmp_int(l.offset, r.offset) ?: cmp_int(l.snapshot, r.snapshot); } static inline struct bpos bpos_min(struct bpos l, struct bpos r) { return bpos_lt(l, r) ? l : r; } static inline struct bpos bpos_max(struct bpos l, struct bpos r) { return bpos_gt(l, r) ? l : r; } static __always_inline bool bkey_eq(struct bpos l, struct bpos r) { return !((l.inode ^ r.inode) | (l.offset ^ r.offset)); } static __always_inline bool bkey_lt(struct bpos l, struct bpos r) { return l.inode != r.inode ? l.inode < r.inode : l.offset < r.offset; } static __always_inline bool bkey_le(struct bpos l, struct bpos r) { return l.inode != r.inode ? l.inode < r.inode : l.offset <= r.offset; } static __always_inline bool bkey_gt(struct bpos l, struct bpos r) { return bkey_lt(r, l); } static __always_inline bool bkey_ge(struct bpos l, struct bpos r) { return bkey_le(r, l); } static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: cmp_int(l.offset, r.offset); } static inline struct bpos bkey_min(struct bpos l, struct bpos r) { return bkey_lt(l, r) ? l : r; } static inline struct bpos bkey_max(struct bpos l, struct bpos r) { return bkey_gt(l, r) ? l : r; } static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { return cmp_int(l.hi, r.hi) ?: cmp_int(l.lo, r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) static __always_inline bool bversion_zero(struct bversion v) { return bversion_cmp(v, ZERO_VERSION) == 0; } #ifdef CONFIG_BCACHEFS_DEBUG /* statement expressions confusing unlikely()? */ #define bkey_packed(_k) \ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ (_k)->format != KEY_FORMAT_CURRENT; }) #else #define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) #endif /* * It's safe to treat an unpacked bkey as a packed one, but not the reverse */ static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) { return (struct bkey_packed *) k; } static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) { return (const struct bkey_packed *) k; } static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) { return bkey_packed(k) ? NULL : (struct bkey_i *) k; } static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) { return bkey_packed(k) ? NULL : (const struct bkey *) k; } static inline unsigned bkey_format_key_bits(const struct bkey_format *format) { return format->bits_per_field[BKEY_FIELD_INODE] + format->bits_per_field[BKEY_FIELD_OFFSET] + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; } static inline struct bpos bpos_successor(struct bpos p) { if (!++p.snapshot && !++p.offset && !++p.inode) BUG(); return p; } static inline struct bpos bpos_predecessor(struct bpos p) { if (!p.snapshot-- && !p.offset-- && !p.inode--) BUG(); return p; } static inline struct bpos bpos_nosnap_successor(struct bpos p) { p.snapshot = 0; if (!++p.offset && !++p.inode) BUG(); return p; } static inline struct bpos bpos_nosnap_predecessor(struct bpos p) { p.snapshot = 0; if (!p.offset-- && !p.inode--) BUG(); return p; } static inline u64 bkey_start_offset(const struct bkey *k) { return k->p.offset - k->size; } static inline struct bpos bkey_start_pos(const struct bkey *k) { return (struct bpos) { .inode = k->p.inode, .offset = bkey_start_offset(k), .snapshot = k->p.snapshot, }; } /* Packed helpers */ static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } static inline bool bkeyp_u64s_valid(const struct bkey_format *f, const struct bkey_packed *k) { return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s); } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, const struct bkey_packed *k) { return bkeyp_key_u64s(format, k) * sizeof(u64); } static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, const struct bkey_packed *k) { return k->u64s - bkeyp_key_u64s(format, k); } static inline size_t bkeyp_val_bytes(const struct bkey_format *format, const struct bkey_packed *k) { return bkeyp_val_u64s(format, k) * sizeof(u64); } static inline void set_bkeyp_val_u64s(const struct bkey_format *format, struct bkey_packed *k, unsigned val_u64s) { k->u64s = bkeyp_key_u64s(format, k) + val_u64s; } #define bkeyp_val(_format, _k) \ ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) extern const struct bkey_format bch2_bkey_format_current; bool bch2_bkey_transform(const struct bkey_format *, struct bkey_packed *, const struct bkey_format *, const struct bkey_packed *); struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, const struct bkey_packed *); #ifndef HAVE_BCACHEFS_COMPILED_UNPACK struct bpos __bkey_unpack_pos(const struct bkey_format *, const struct bkey_packed *); #endif bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, const struct bkey_format *); enum bkey_pack_pos_ret { BKEY_PACK_POS_EXACT, BKEY_PACK_POS_SMALLER, BKEY_PACK_POS_FAIL, }; enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, const struct btree *); static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, const struct btree *b) { return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; } void bch2_bkey_unpack(const struct btree *, struct bkey_i *, const struct bkey_packed *); bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, const struct bkey_format *); typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); static inline void __bkey_unpack_key_format_checked(const struct btree *b, struct bkey *dst, const struct bkey_packed *src) { if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); } } else { *dst = __bch2_bkey_unpack_key(&b->format, src); } } static inline struct bkey bkey_unpack_key_format_checked(const struct btree *b, const struct bkey_packed *src) { struct bkey dst; __bkey_unpack_key_format_checked(b, &dst, src); return dst; } static inline void __bkey_unpack_key(const struct btree *b, struct bkey *dst, const struct bkey_packed *src) { if (likely(bkey_packed(src))) __bkey_unpack_key_format_checked(b, dst, src); else *dst = *packed_to_bkey_c(src); } /** * bkey_unpack_key -- unpack just the key, not the value */ static inline struct bkey bkey_unpack_key(const struct btree *b, const struct bkey_packed *src) { return likely(bkey_packed(src)) ? bkey_unpack_key_format_checked(b, src) : *packed_to_bkey_c(src); } static inline struct bpos bkey_unpack_pos_format_checked(const struct btree *b, const struct bkey_packed *src) { #ifdef HAVE_BCACHEFS_COMPILED_UNPACK return bkey_unpack_key_format_checked(b, src).p; #else return __bkey_unpack_pos(&b->format, src); #endif } static inline struct bpos bkey_unpack_pos(const struct btree *b, const struct bkey_packed *src) { return likely(bkey_packed(src)) ? bkey_unpack_pos_format_checked(b, src) : packed_to_bkey_c(src)->p; } /* Disassembled bkeys */ static inline struct bkey_s_c bkey_disassemble(const struct btree *b, const struct bkey_packed *k, struct bkey *u) { __bkey_unpack_key(b, u, k); return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; } /* non const version: */ static inline struct bkey_s __bkey_disassemble(const struct btree *b, struct bkey_packed *k, struct bkey *u) { __bkey_unpack_key(b, u, k); return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; } static inline u64 bkey_field_max(const struct bkey_format *f, enum bch_bkey_fields nr) { return f->bits_per_field[nr] < 64 ? (le64_to_cpu(f->field_offset[nr]) + ~(~0ULL << f->bits_per_field[nr])) : U64_MAX; } #ifdef HAVE_BCACHEFS_COMPILED_UNPACK int bch2_compile_bkey_format(const struct bkey_format *, void *); #else static inline int bch2_compile_bkey_format(const struct bkey_format *format, void *out) { return 0; } #endif static inline void bkey_reassemble(struct bkey_i *dst, struct bkey_s_c src) { dst->k = *src.k; memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ static inline unsigned high_word_offset(const struct bkey_format *f) { return f->key_u64s - 1; } #define high_bit_offset 0 #define nth_word(p, n) ((p) - (n)) #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ static inline unsigned high_word_offset(const struct bkey_format *f) { return 0; } #define high_bit_offset KEY_PACKED_BITS_START #define nth_word(p, n) ((p) + (n)) #else #error edit for your odd byteorder. #endif #define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) #define next_word(p) nth_word(p, 1) #define prev_word(p) nth_word(p, -1) #ifdef CONFIG_BCACHEFS_DEBUG void bch2_bkey_pack_test(void); #else static inline void bch2_bkey_pack_test(void) {} #endif #define bkey_fields() \ x(BKEY_FIELD_INODE, p.inode) \ x(BKEY_FIELD_OFFSET, p.offset) \ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ x(BKEY_FIELD_SIZE, size) \ x(BKEY_FIELD_VERSION_HI, bversion.hi) \ x(BKEY_FIELD_VERSION_LO, bversion.lo) struct bkey_format_state { u64 field_min[BKEY_NR_FIELDS]; u64 field_max[BKEY_NR_FIELDS]; }; void bch2_bkey_format_init(struct bkey_format_state *); static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) { s->field_min[field] = min(s->field_min[field], v); s->field_max[field] = max(s->field_max[field], v); } /* * Changes @format so that @k can be successfully packed with @format */ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) { #define x(id, field) __bkey_format_add(s, id, k->field); bkey_fields() #undef x } void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); static inline bool bch2_bkey_format_field_overflows(struct bkey_format *f, unsigned i) { unsigned f_bits = f->bits_per_field[i]; unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); if (f_bits > unpacked_bits) return true; if ((f_bits == unpacked_bits) && field_offset) return true; u64 f_mask = f_bits ? ~((~0ULL << (f_bits - 1)) << 1) : 0; if (((field_offset + f_mask) & unpacked_mask) < field_offset) return true; return false; } int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, enum bch_validate_flags, struct printbuf *); void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ |
| 219 8 226 12 219 218 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. */ #ifndef BTRFS_DELAYED_REF_H #define BTRFS_DELAYED_REF_H #include <linux/types.h> #include <linux/refcount.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> struct btrfs_trans_handle; struct btrfs_fs_info; /* these are the possible values of struct btrfs_delayed_ref_node->action */ enum btrfs_delayed_ref_action { /* Add one backref to the tree */ BTRFS_ADD_DELAYED_REF = 1, /* Delete one backref from the tree */ BTRFS_DROP_DELAYED_REF, /* Record a full extent allocation */ BTRFS_ADD_DELAYED_EXTENT, /* Not changing ref count on head ref */ BTRFS_UPDATE_DELAYED_HEAD, } __packed; struct btrfs_data_ref { /* For EXTENT_DATA_REF */ /* Inode which refers to this data extent */ u64 objectid; /* * file_offset - extent_offset * * file_offset is the key.offset of the EXTENT_DATA key. * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. */ u64 offset; }; struct btrfs_tree_ref { /* * Level of this tree block. * * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. */ int level; /* For non-skinny metadata, no special member needed */ }; struct btrfs_delayed_ref_node { struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the * refs rbtree in the corresponding delayed ref head * (struct btrfs_delayed_ref_head::ref_tree). */ struct list_head add_list; /* the starting bytenr of the extent */ u64 bytenr; /* the size of the extent */ u64 num_bytes; /* seq number to keep track of insertion order */ u64 seq; /* The ref_root for this ref */ u64 ref_root; /* * The parent for this ref, if this isn't set the ref_root is the * reference owner. */ u64 parent; /* ref count on this data structure */ refcount_t refs; /* * how many refs is this entry adding or deleting. For * head refs, this may be a negative number because it is keeping * track of the total mods done to the reference count. * For individual refs, this will always be a positive number * * It may be more than one, since it is possible for a single * parent to have more than one ref on an extent */ int ref_mod; unsigned int action:8; unsigned int type:8; union { struct btrfs_tree_ref tree_ref; struct btrfs_data_ref data_ref; }; }; struct btrfs_delayed_extent_op { struct btrfs_disk_key key; bool update_key; bool update_flags; u64 flags_to_set; }; /* * the head refs are used to hold a lock on a given extent, which allows us * to make sure that only one process is running the delayed refs * at a time for a single extent. They also store the sum of all the * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ struct mutex mutex; refcount_t refs; /* Protects 'ref_tree' and 'ref_add_list'. */ spinlock_t lock; struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; struct btrfs_delayed_extent_op *extent_op; /* * This is used to track the final ref_mod from all the refs associated * with this head ref, this is not adjusted as delayed refs are run, * this is meant to track if we need to do the csum accounting or not. */ int total_ref_mod; /* * This is the current outstanding mod references for this bytenr. This * is used with lookup_extent_info to get an accurate reference count * for a bytenr, so it is adjusted as delayed refs are run so that any * on disk reference count + ref_mod is accurate. */ int ref_mod; /* * The root that triggered the allocation when must_insert_reserved is * set to true. */ u64 owning_root; /* * Track reserved bytes when setting must_insert_reserved. On success * or cleanup, we will need to free the reservation. */ u64 reserved_bytes; /* Tree block level, for metadata only. */ u8 level; /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is * used to flag a delayed ref so the accounting can be updated * when a full insert is done. * * It is possible the extent will be freed before it is ever * inserted into the extent allocation tree. In this case * we need to update the in ram accounting to properly reflect * the free has happened. */ bool must_insert_reserved; bool is_data; bool is_system; bool processing; /* * Indicate if it's currently in the data structure that tracks head * refs (struct btrfs_delayed_ref_root::head_refs). */ bool tracked; }; enum btrfs_delayed_ref_flags { /* Indicate that we are flushing delayed refs for the commit */ BTRFS_DELAYED_REFS_FLUSHING, }; struct btrfs_delayed_ref_root { /* * Track head references. * The keys correspond to the logical address of the extent ("bytenr") * right shifted by fs_info->sectorsize_bits. This is both to get a more * dense index space (optimizes xarray structure) and because indexes in * xarrays are of "unsigned long" type, meaning they are 32 bits wide on * 32 bits platforms, limiting the extent range to 4G which is too low * and makes it unusable (truncated index values) on 32 bits platforms. * Protected by the spinlock 'lock' defined below. */ struct xarray head_refs; /* * Track dirty extent records. * The keys correspond to the logical address of the extent ("bytenr") * right shifted by fs_info->sectorsize_bits, for same reasons as above. */ struct xarray dirty_extents; /* * Protects the xarray head_refs, its entries and the following fields: * num_heads, num_heads_ready, pending_csums and run_delayed_start. */ spinlock_t lock; /* Total number of head refs, protected by the spinlock 'lock'. */ unsigned long num_heads; /* * Total number of head refs ready for processing, protected by the * spinlock 'lock'. */ unsigned long num_heads_ready; /* * Track space reserved for deleting csums of data extents. * Protected by the spinlock 'lock'. */ u64 pending_csums; unsigned long flags; /* * Track from which bytenr to start searching ref heads. * Protected by the spinlock 'lock'. */ u64 run_delayed_start; /* * To make qgroup to skip given root. * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ u64 qgroup_to_skip; }; enum btrfs_ref_type { BTRFS_REF_NOT_SET, BTRFS_REF_DATA, BTRFS_REF_METADATA, BTRFS_REF_LAST, } __packed; struct btrfs_ref { enum btrfs_ref_type type; enum btrfs_delayed_ref_action action; /* * Whether this extent should go through qgroup record. * * Normally false, but for certain cases like delayed subtree scan, * setting this flag can hugely reduce qgroup overhead. */ bool skip_qgroup; #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* Through which root is this modification. */ u64 real_root; #endif u64 bytenr; u64 num_bytes; u64 owning_root; /* * The root that owns the reference for this reference, this will be set * or ->parent will be set, depending on what type of reference this is. */ u64 ref_root; /* Bytenr of the parent tree block */ u64 parent; union { struct btrfs_data_ref data_ref; struct btrfs_tree_ref tree_ref; }; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; extern struct kmem_cache *btrfs_delayed_ref_node_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); void __cold btrfs_delayed_ref_exit(void); static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, int num_delayed_refs) { u64 num_bytes; num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); /* * We have to check the mount option here because we could be enabling * the free space tree for the first time and don't have the compat_ro * option set yet. * * We need extra reservations if we have the free space tree because * we'll have to modify that tree as well. */ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) num_bytes *= 2; return num_bytes; } static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info, int num_csum_items) { /* * Deleting csum items does not result in new nodes/leaves and does not * require changing the free space tree, only the csum tree, so this is * all we need. */ return btrfs_calc_metadata_size(fs_info, num_csum_items); } void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup); void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) { return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); } static inline void btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) { if (op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op); } void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref); static inline u64 btrfs_ref_head_to_space_flags( struct btrfs_delayed_ref_head *head_ref) { if (head_ref->is_data) return BTRFS_BLOCK_GROUP_DATA; else if (head_ref->is_system) return BTRFS_BLOCK_GROUP_SYSTEM; return BTRFS_BLOCK_GROUP_METADATA; } static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) { if (refcount_dec_and_test(&head->refs)) kmem_cache_free(btrfs_delayed_ref_head_cachep, head); } int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op); void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr); static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs); void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent); void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans); static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) return node->data_ref.objectid; return node->tree_ref.level; } static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) { if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) return node->data_ref.offset; return 0; } static inline u8 btrfs_ref_type(struct btrfs_ref *ref) { ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); if (ref->type == BTRFS_REF_DATA) { if (ref->parent) return BTRFS_SHARED_DATA_REF_KEY; else return BTRFS_EXTENT_DATA_REF_KEY; } else { if (ref->parent) return BTRFS_SHARED_BLOCK_REF_KEY; else return BTRFS_TREE_BLOCK_REF_KEY; } return 0; } #endif |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H #include <linux/context_tracking_state.h> #include <linux/kprobes.h> #include <asm/debugreg.h> #include <asm/idtentry.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ #include <asm/trap_pf.h> #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif extern int ibt_selftest(void); extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ void handle_invalid_op(struct pt_regs *regs); #endif static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) return TRAP_TRACE; else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) return TRAP_HWBKPT; else return TRAP_BRKPT; } extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info); #endif static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); } #endif /* _ASM_X86_TRAPS_H */ |
| 28 27 28 1 2 2 2 2 30 2 19 19 18 1 17 17 17 30 30 2 28 2 2 3 1 1 1 2 2 14 3 10 2 13 13 13 3 3 1 2 2 2 2 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Red Hat, Inc. */ #include <linux/cred.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/backing-file.h> #include "overlayfs.h" static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) return 'l'; if (ovl_has_upperdata(inode)) return 'u'; else return 'm'; } static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); struct mnt_idmap *real_idmap; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; int acc_mode = ACC_MODE(flags); int err; if (flags & O_APPEND) acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); real_idmap = mnt_idmap(realpath->mnt); err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; realfile = backing_file_open(&file->f_path, flags, realpath, current_cred()); } ovl_revert_creds(old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, realfile, IS_ERR(realfile) ? 0 : realfile->f_flags); return realfile; } #define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT) static int ovl_change_flags(struct file *file, unsigned int flags) { struct inode *inode = file_inode(file); int err; flags &= OVL_SETFL_MASK; if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); if (err) return err; } spin_lock(&file->f_lock); file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; file->f_iocb_flags = iocb_flags(file); spin_unlock(&file->f_lock); return 0; } struct ovl_file { struct file *realfile; struct file *upperfile; }; struct ovl_file *ovl_file_alloc(struct file *realfile) { struct ovl_file *of = kzalloc(sizeof(struct ovl_file), GFP_KERNEL); if (unlikely(!of)) return NULL; of->realfile = realfile; return of; } void ovl_file_free(struct ovl_file *of) { fput(of->realfile); if (of->upperfile) fput(of->upperfile); kfree(of); } static bool ovl_is_real_file(const struct file *realfile, const struct path *realpath) { return file_inode(realfile) == d_inode(realpath->dentry); } static struct file *ovl_real_file_path(const struct file *file, struct path *realpath) { struct ovl_file *of = file->private_data; struct file *realfile = of->realfile; if (WARN_ON_ONCE(!realpath->dentry)) return ERR_PTR(-EIO); /* * If the realfile that we want is not where the data used to be at * open time, either we'd been copied up, or it's an fsync of a * metacopied file. We need the upperfile either way, so see if it * is already opened and if it is not then open and store it. */ if (unlikely(!ovl_is_real_file(realfile, realpath))) { struct file *upperfile = READ_ONCE(of->upperfile); struct file *old; if (!upperfile) { /* Nobody opened upperfile yet */ upperfile = ovl_open_realfile(file, realpath); if (IS_ERR(upperfile)) return upperfile; /* Store the upperfile for later */ old = cmpxchg_release(&of->upperfile, NULL, upperfile); if (old) { /* Someone opened upperfile before us */ fput(upperfile); upperfile = old; } } /* * Stored file must be from the right inode, unless someone's * been corrupting the upper layer. */ if (WARN_ON_ONCE(!ovl_is_real_file(upperfile, realpath))) return ERR_PTR(-EIO); realfile = upperfile; } /* Did the flags change since open? */ if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS)) { int err = ovl_change_flags(realfile, file->f_flags); if (err) return ERR_PTR(err); } return realfile; } static struct file *ovl_real_file(const struct file *file) { struct dentry *dentry = file_dentry(file); struct path realpath; int err; if (d_is_dir(dentry)) { struct file *f = ovl_dir_real_file(file, false); if (WARN_ON_ONCE(!f)) return ERR_PTR(-EIO); return f; } /* lazy lookup and verify of lowerdata */ err = ovl_verify_lowerdata(dentry); if (err) return ERR_PTR(err); ovl_path_realdata(dentry, &realpath); return ovl_real_file_path(file, &realpath); } static int ovl_open(struct inode *inode, struct file *file) { struct dentry *dentry = file_dentry(file); struct file *realfile; struct path realpath; struct ovl_file *of; int err; /* lazy lookup and verify lowerdata */ err = ovl_verify_lowerdata(dentry); if (err) return err; err = ovl_maybe_copy_up(dentry, file->f_flags); if (err) return err; /* No longer need these flags, so don't pass them on to underlying fs */ file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); ovl_path_realdata(dentry, &realpath); if (!realpath.dentry) return -EIO; realfile = ovl_open_realfile(file, &realpath); if (IS_ERR(realfile)) return PTR_ERR(realfile); of = ovl_file_alloc(realfile); if (!of) { fput(realfile); return -ENOMEM; } file->private_data = of; return 0; } static int ovl_release(struct inode *inode, struct file *file) { ovl_file_free(file->private_data); return 0; } static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); struct file *realfile; const struct cred *old_cred; loff_t ret; /* * The two special cases below do not need to involve real fs, * so we can optimizing concurrent callers. */ if (offset == 0) { if (whence == SEEK_CUR) return file->f_pos; if (whence == SEEK_SET) return vfs_setpos(file, 0, 0); } realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); /* * Overlay file f_pos is the master copy that is preserved * through copy up and modified on read/write, but only real * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose * limitations that are more strict than ->s_maxbytes for specific * files, so we use the real file to perform seeks. */ ovl_inode_lock(inode); realfile->f_pos = file->f_pos; old_cred = ovl_override_creds(inode->i_sb); ret = vfs_llseek(realfile, offset, whence); ovl_revert_creds(old_cred); file->f_pos = realfile->f_pos; ovl_inode_unlock(inode); return ret; } static void ovl_file_modified(struct file *file) { /* Update size/mtime */ ovl_copyattr(file_inode(file)); } static void ovl_file_end_write(struct kiocb *iocb, ssize_t ret) { ovl_file_modified(iocb->ki_filp); } static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; struct timespec64 ctime, uctime; struct timespec64 mtime, umtime; if (file->f_flags & O_NOATIME) return; inode = file_inode(file); upperinode = ovl_inode_upper(inode); if (!upperinode) return; ctime = inode_get_ctime(inode); uctime = inode_get_ctime(upperinode); mtime = inode_get_mtime(inode); umtime = inode_get_mtime(upperinode); if ((!timespec64_equal(&mtime, &umtime)) || !timespec64_equal(&ctime, &uctime)) { inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode)); inode_set_ctime_to_ts(inode, uctime); } touch_atime(&file->f_path); } static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct file *realfile; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), .accessed = ovl_file_accessed, }; if (!iov_iter_count(iter)) return 0; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, &ctx); } static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct file *realfile; ssize_t ret; int ifl = iocb->ki_flags; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .end_write = ovl_file_end_write, }; if (!iov_iter_count(iter)) return 0; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); realfile = ovl_real_file(file); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); /* * Overlayfs doesn't support deferred completions, don't copy * this property in case it is set by the issuer. */ ifl &= ~IOCB_DIO_CALLER_COMP; ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx); out_unlock: inode_unlock(inode); return ret; } static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct file *realfile; ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(in)->i_sb), .accessed = ovl_file_accessed, }; struct kiocb iocb; realfile = ovl_real_file(in); if (IS_ERR(realfile)) return PTR_ERR(realfile); init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; ret = backing_file_splice_read(realfile, &iocb, pipe, len, flags, &ctx); *ppos = iocb.ki_pos; return ret; } /* * Calling iter_file_splice_write() directly from overlay's f_op may deadlock * due to lock order inversion between pipe->mutex in iter_file_splice_write() * and file_start_write(realfile) in ovl_write_iter(). * * So do everything ovl_write_iter() does and call iter_file_splice_write() on * the real file. */ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct file *realfile; struct inode *inode = file_inode(out); ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .end_write = ovl_file_end_write, }; struct kiocb iocb; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); realfile = ovl_real_file(out); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; init_sync_kiocb(&iocb, out); iocb.ki_pos = *ppos; ret = backing_file_splice_write(pipe, realfile, &iocb, len, flags, &ctx); *ppos = iocb.ki_pos; out_unlock: inode_unlock(inode); return ret; } static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); enum ovl_path_type type; struct path upperpath; struct file *upperfile; const struct cred *old_cred; int ret; ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (ret <= 0) return ret; /* Don't sync lower file for fear of receiving EROFS error */ type = ovl_path_type(dentry); if (!OVL_TYPE_UPPER(type) || (datasync && OVL_TYPE_MERGE(type))) return 0; ovl_path_upper(dentry, &upperpath); upperfile = ovl_real_file_path(file, &upperpath); if (IS_ERR(upperfile)) return PTR_ERR(upperfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fsync_range(upperfile, start, end, datasync); ovl_revert_creds(old_cred); return ret; } static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct ovl_file *of = file->private_data; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), .accessed = ovl_file_accessed, }; return backing_file_mmap(of->realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct file *realfile; const struct cred *old_cred; int ret; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); ret = file_remove_privs(file); if (ret) goto out_unlock; realfile = ovl_real_file(file); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fallocate(realfile, mode, offset, len); ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file); out_unlock: inode_unlock(inode); return ret; } static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct file *realfile; const struct cred *old_cred; int ret; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = vfs_fadvise(realfile, offset, len, advice); ovl_revert_creds(old_cred); return ret; } enum ovl_copyop { OVL_COPY, OVL_CLONE, OVL_DEDUPE, }; static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); struct file *realfile_in, *realfile_out; const struct cred *old_cred; loff_t ret; inode_lock(inode_out); if (op != OVL_DEDUPE) { /* Update mode */ ovl_copyattr(inode_out); ret = file_remove_privs(file_out); if (ret) goto out_unlock; } realfile_out = ovl_real_file(file_out); ret = PTR_ERR(realfile_out); if (IS_ERR(realfile_out)) goto out_unlock; realfile_in = ovl_real_file(file_in); ret = PTR_ERR(realfile_in); if (IS_ERR(realfile_in)) goto out_unlock; old_cred = ovl_override_creds(file_inode(file_out)->i_sb); switch (op) { case OVL_COPY: ret = vfs_copy_file_range(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; case OVL_CLONE: ret = vfs_clone_file_range(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; } ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file_out); out_unlock: inode_unlock(inode_out); return ret; } static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags, OVL_COPY); } static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { enum ovl_copyop op; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; if (remap_flags & REMAP_FILE_DEDUP) op = OVL_DEDUPE; else op = OVL_CLONE; /* * Don't copy up because of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). */ if (op == OVL_DEDUPE && (!ovl_inode_upper(file_inode(file_in)) || !ovl_inode_upper(file_inode(file_out)))) return -EPERM; return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, remap_flags, op); } static int ovl_flush(struct file *file, fl_owner_t id) { struct file *realfile; const struct cred *old_cred; int err = 0; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); if (realfile->f_op->flush) { old_cred = ovl_override_creds(file_inode(file)->i_sb); err = realfile->f_op->flush(realfile, id); ovl_revert_creds(old_cred); } return err; } const struct file_operations ovl_file_operations = { .open = ovl_open, .release = ovl_release, .llseek = ovl_llseek, .read_iter = ovl_read_iter, .write_iter = ovl_write_iter, .fsync = ovl_fsync, .mmap = ovl_mmap, .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, .flush = ovl_flush, .splice_read = ovl_splice_read, .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; |
| 735 733 557 557 64 64 437 3 3 3 2 3 1 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 1 1 3 294 436 437 437 143 294 436 437 139 434 437 64 64 64 64 64 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 | // SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett */ #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/log2.h> #include <linux/sched.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/rhashtable.h> #include <linux/err.h> #include <linux/export.h> #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U union nested_table { union nested_table __rcu *table; struct rhash_lock_head __rcu *bucket; }; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) { return rht_head_hashfn(ht, tbl, he, ht->p); } #ifdef CONFIG_PROVE_LOCKING #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { if (!debug_locks) return 1; if (unlikely(tbl->nest)) return 1; return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #else #define ASSERT_RHT_MUTEX(HT) #endif static inline union nested_table *nested_table_top( const struct bucket_table *tbl) { /* The top-level bucket entry does not need RCU protection * because it's set at the same time as tbl->nest. */ return (void *)rcu_dereference_protected(tbl->buckets[0], 1); } static void nested_table_free(union nested_table *ntbl, unsigned int size) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); const unsigned int len = 1 << shift; unsigned int i; ntbl = rcu_dereference_protected(ntbl->table, 1); if (!ntbl) return; if (size > len) { size >>= shift; for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); } kfree(ntbl); } static void nested_bucket_table_free(const struct bucket_table *tbl) { unsigned int size = tbl->size >> tbl->nest; unsigned int len = 1 << tbl->nest; union nested_table *ntbl; unsigned int i; ntbl = nested_table_top(tbl); for (i = 0; i < len; i++) nested_table_free(ntbl + i, size); kfree(ntbl); } static void bucket_table_free(const struct bucket_table *tbl) { if (tbl->nest) nested_bucket_table_free(tbl); kvfree(tbl); } static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); } static union nested_table *nested_table_alloc(struct rhashtable *ht, union nested_table __rcu **prev, bool leaf) { union nested_table *ntbl; int i; ntbl = rcu_dereference(*prev); if (ntbl) return ntbl; ntbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO)); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) INIT_RHT_NULLS_HEAD(ntbl[i].bucket); } if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) return ntbl; /* Raced with another thread. */ kfree(ntbl); return rcu_dereference(*prev); } static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); struct bucket_table *tbl; size_t size; if (nbuckets < (1 << (shift + 1))) return NULL; size = sizeof(*tbl) + sizeof(tbl->buckets[0]); tbl = alloc_hooks_tag(ht->alloc_tag, kmalloc_noprof(size, gfp|__GFP_ZERO)); if (!tbl) return NULL; if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, false)) { kfree(tbl); return NULL; } tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; return tbl; } static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) { struct bucket_table *tbl = NULL; size_t size; int i; static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } if (tbl == NULL) return NULL; lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0); tbl->size = size; rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *new_tbl; do { new_tbl = tbl; tbl = rht_dereference_rcu(tbl->future_tbl, ht); } while (tbl); return new_tbl; } static int rhashtable_rehash_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); int err = -EAGAIN; struct rhash_head *head, *next, *entry; struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; unsigned long flags; if (new_tbl->nest) goto out; err = -ENOENT; rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); if (rht_is_a_nulls(next)) break; pprev = &entry->next; } if (err) goto out; new_hash = head_hashfn(ht, new_tbl, entry); flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING); head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags); if (pprev) rcu_assign_pointer(*pprev, next); else /* Need to preserved the bit lock. */ rht_assign_locked(bkt, next); out: return err; } static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); unsigned long flags; int err; if (!bkt) return 0; flags = rht_lock(old_tbl, bkt); while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; if (err == -ENOENT) err = 0; rht_unlock(old_tbl, bkt, flags); return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. * As cmpxchg() provides strong barriers, we do not need * rcu_assign_pointer(). */ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, new_tbl) != NULL) return -EEXIST; return 0; } static int rhashtable_rehash_table(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; cond_resched(); } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. * We do this inside the locked region so that * rhashtable_walk_stop() can use rcu_head_after_call_rcu() * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } static int rhashtable_rehash_alloc(struct rhashtable *ht, struct bucket_table *old_tbl, unsigned int size) { struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) bucket_table_free(new_tbl); return err; } /** * rhashtable_shrink - Shrink hash table while allowing concurrent lookups * @ht: the hash table to shrink * * This function shrinks the hash table to fit, i.e., the smallest * size would not cause it to expand right away automatically. * * The caller must ensure that no concurrent resizing occurs by holding * ht->mutex. * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. * * It is valid to have concurrent insertions and deletions protected by per * bucket locks or concurrent RCU protected lookups and traversals. */ static int rhashtable_shrink(struct rhashtable *ht) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); if (size < ht->p.min_size) size = ht->p.min_size; if (old_tbl->size <= size) return 0; if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; int err = 0; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) err = rhashtable_shrink(ht); else if (tbl->nest) err = rhashtable_rehash_alloc(ht, tbl, tbl->size); if (!err || err == -EEXIST) { int nerr; nerr = rhashtable_rehash_table(ht); err = err ?: nerr; } mutex_unlock(&ht->mutex); if (err) schedule_work(&ht->run_work); } static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); size = tbl->size; err = -EBUSY; if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) goto fail; err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { bucket_table_free(new_tbl); if (err == -EEXIST) err = 0; } else schedule_work(&ht->run_work); return err; fail: /* Do not fail the insert if someone else did a rehash. */ if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) schedule_work(&ht->run_work); return err; } static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, const void *key, struct rhash_head *obj) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; int elasticity; elasticity = RHT_ELASTICITY; rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; struct rhlist_head *plist; elasticity--; if (!key || (ht->p.obj_cmpfn ? ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } if (!ht->rhlist) return rht_obj(ht, head); list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) rcu_assign_pointer(*pprev, obj); else /* Need to preserve the bit lock */ rht_assign_locked(bkt, obj); return NULL; } if (elasticity <= 0) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); } static struct bucket_table *rhashtable_insert_one( struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, void *data) { struct bucket_table *new_tbl; struct rhash_head *head; if (!IS_ERR_OR_NULL(data)) return ERR_PTR(-EEXIST); if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) return ERR_CAST(data); new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (new_tbl) return new_tbl; if (PTR_ERR(data) != -ENOENT) return ERR_CAST(data); if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_100(ht, tbl))) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (ht->rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } /* bkt is always the head of the list, so it holds * the lock, which we need to preserve */ rht_assign_locked(bkt, obj); atomic_inc(&ht->nelems); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); return NULL; } static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, struct rhash_head *obj) { struct bucket_table *new_tbl; struct bucket_table *tbl; struct rhash_lock_head __rcu **bkt; unsigned long flags; unsigned int hash; void *data; new_tbl = rcu_dereference(ht->tbl); do { tbl = new_tbl; hash = rht_head_hashfn(ht, tbl, obj, ht->p); if (rcu_access_pointer(tbl->future_tbl)) /* Failure is OK */ bkt = rht_bucket_var(tbl, hash); else bkt = rht_bucket_insert(ht, tbl, hash); if (bkt == NULL) { new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); data = ERR_PTR(-EAGAIN); } else { flags = rht_lock(tbl, bkt); data = rhashtable_lookup_one(ht, bkt, tbl, hash, key, obj); new_tbl = rhashtable_insert_one(ht, bkt, tbl, hash, obj, data); if (PTR_ERR(new_tbl) != -EEXIST) data = ERR_CAST(new_tbl); rht_unlock(tbl, bkt, flags); } } while (!IS_ERR_OR_NULL(new_tbl)); if (PTR_ERR(data) == -EAGAIN) data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: -EAGAIN); return data; } void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj) { void *data; do { rcu_read_lock(); data = rhashtable_try_insert(ht, key, obj); rcu_read_unlock(); } while (PTR_ERR(data) == -EAGAIN); return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; iter->end_of_table = 0; spin_lock(&ht->lock); iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator * @iter: Hash table Iterator * * This function frees resources allocated by rhashtable_walk_enter. */ void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); /** * rhashtable_walk_start_check - Start a hash table walk * @iter: Hash table iterator * * Start a hash table walk at the current iterator position. Note that we take * the RCU lock in all cases including when we return an error. So you must * always call rhashtable_walk_stop to clean up. * * Returns zero if successful. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may use it immediately * by calling rhashtable_walk_next. * * rhashtable_walk_start is defined as an inline variant that returns * void. This is preferred in cases where the caller would ignore * resize events and always continue. */ int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU) { struct rhashtable *ht = iter->ht; bool rhlist = ht->rhlist; rcu_read_lock(); spin_lock(&ht->lock); if (iter->walker.tbl) list_del(&iter->walker.list); spin_unlock(&ht->lock); if (iter->end_of_table) return 0; if (!iter->walker.tbl) { iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); iter->slot = 0; iter->skip = 0; return -EAGAIN; } if (iter->p && !rhlist) { /* * We need to validate that 'p' is still in the table, and * if so, update 'skip' */ struct rhash_head *p; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { skip++; if (p == iter->p) { iter->skip = skip; goto found; } } iter->p = NULL; } else if (iter->p && rhlist) { /* Need to validate that 'list' is still in the table, and * if so, update 'skip' and 'p'. */ struct rhash_head *p; struct rhlist_head *list; int skip = 0; rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { for (list = container_of(p, struct rhlist_head, rhead); list; list = rcu_dereference(list->next)) { skip++; if (list == iter->list) { iter->p = p; iter->skip = skip; goto found; } } } iter->p = NULL; } found: return 0; } EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); /** * __rhashtable_walk_find_next - Find the next element in a table (or the first * one in case of a new walk). * * @iter: Hash table iterator * * Returns the found object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. */ static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) { struct bucket_table *tbl = iter->walker.tbl; struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (!tbl) return NULL; for (; iter->slot < tbl->size; iter->slot++) { int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { if (rhlist) { list = container_of(p, struct rhlist_head, rhead); do { if (!skip) goto next; skip--; list = rcu_dereference(list->next); } while (list); continue; } if (!skip) break; skip--; } next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; } iter->p = NULL; /* Ensure we see any new tables. */ smp_rmb(); iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); } else { iter->end_of_table = true; } return NULL; } /** * rhashtable_walk_next - Return the next object and advance the iterator * @iter: Hash table iterator * * Note that you must call rhashtable_walk_stop when you are finished * with the walk. * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; bool rhlist = ht->rhlist; if (p) { if (!rhlist || !(list = rcu_dereference(list->next))) { p = rcu_dereference(p->next); list = container_of(p, struct rhlist_head, rhead); } if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; iter->list = list; return rht_obj(ht, rhlist ? &list->rhead : p); } /* At the end of this slot, switch to next one and then find * next entry from that point. */ iter->skip = 0; iter->slot++; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_next); /** * rhashtable_walk_peek - Return the next object but don't advance the iterator * @iter: Hash table iterator * * Returns the next object or NULL when the end of the table is reached. * * Returns -EAGAIN if resize event occurred. Note that the iterator * will rewind back to the beginning and you may continue to use it. */ void *rhashtable_walk_peek(struct rhashtable_iter *iter) { struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; if (p) return rht_obj(ht, ht->rhlist ? &list->rhead : p); /* No object found in current iter, find next one in the table. */ if (iter->skip) { /* A nonzero skip value points to the next entry in the table * beyond that last one that was found. Decrement skip so * we find the current value. __rhashtable_walk_find_next * will restore the original value of skip assuming that * the table hasn't changed. */ iter->skip--; } return __rhashtable_walk_find_next(iter); } EXPORT_SYMBOL_GPL(rhashtable_walk_peek); /** * rhashtable_walk_stop - Finish a hash table walk * @iter: Hash table iterator * * Finish a hash table walk. Does not reset the iterator to the start of the * hash table. */ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; ht = iter->ht; spin_lock(&ht->lock); if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) /* This bucket table is being freed, don't re-link it. */ iter->walker.tbl = NULL; else list_add(&iter->walker.list, &tbl->walkers); spin_unlock(&ht->lock); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { size_t retsize; if (params->nelem_hint) retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), (unsigned long)params->min_size); else retsize = max(HASH_DEFAULT_SIZE, (unsigned long)params->min_size); return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) { return jhash2(key, length, seed); } /** * rhashtable_init - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * * Initializes a new hash table based on the provided configuration * parameters. A table can be configured either with a variable or * fixed length key: * * Configuration Example 1: Fixed length keys * struct test_obj { * int key; * void * my_member; * struct rhash_head node; * }; * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, * }; * * Configuration Example 2: Variable length keys * struct test_obj { * [...] * struct rhash_head node; * }; * * u32 my_hash_fn(const void *data, u32 len, u32 seed) * { * struct test_obj *obj = data; * * return [... hash ...]; * } * * struct rhashtable_params params = { * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, * }; */ int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); alloc_tag_record(ht->alloc_tag); if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); /* Cap total entries at 2^31 to avoid nelems overflow. */ ht->max_elems = 1u << 31; if (params->max_size) { ht->p.max_size = rounddown_pow_of_two(params->max_size); if (ht->p.max_size < ht->max_elems / 2) ht->max_elems = ht->p.max_size * 2; } ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { ht->p.hashfn = jhash; if (!(ht->key_len & (sizeof(u32) - 1))) { ht->key_len /= sizeof(u32); ht->p.hashfn = rhashtable_jhash2; } } /* * This is api initialization and thus we need to guarantee the * initial rhashtable allocation. Upon failure, retry with the * smallest possible size with __GFP_NOFAIL semantics. */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (unlikely(tbl == NULL)) { size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); } atomic_set(&ht->nelems, 0); RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** * rhltable_init - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * * See documentation for rhashtable_init. */ int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), void *arg) { struct rhlist_head *list; if (!ht->rhlist) { free_fn(rht_obj(ht, obj), arg); return; } list = container_of(obj, struct rhlist_head, rhead); do { obj = &list->rhead; list = rht_dereference(list->next, ht); free_fn(rht_obj(ht, obj), arg); } while (list); } /** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * Stops an eventual async resize. If defined, invokes free_fn for each * element to releasal resources. Please note that RCU protected * readers may still be accessing the elements. Releasing of resources * must occur in a compatible manner. Then frees the bucket array. * * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { struct bucket_table *tbl, *next_tbl; unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); void rhashtable_destroy(struct rhashtable *ht) { return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; unsigned int subhash = hash; union nested_table *ntbl; ntbl = nested_table_top(tbl); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); subhash >>= tbl->nest; while (ntbl && size > (1 << shift)) { index = subhash & ((1 << shift) - 1); ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); size >>= shift; subhash >>= shift; } if (!ntbl) return NULL; return &ntbl[subhash].bucket; } EXPORT_SYMBOL_GPL(__rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash) { static struct rhash_lock_head __rcu *rhnull; if (!rhnull) INIT_RHT_NULLS_HEAD(rhnull); return __rht_bucket_nested(tbl, hash) ?: &rhnull; } EXPORT_SYMBOL_GPL(rht_bucket_nested); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; ntbl = nested_table_top(tbl); hash >>= tbl->nest; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); while (ntbl && size > (1 << shift)) { index = hash & ((1 << shift) - 1); size >>= shift; hash >>= shift; ntbl = nested_table_alloc(ht, &ntbl[index].table, size <= (1 << shift)); } if (!ntbl) return NULL; return &ntbl[hash].bucket; } EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); |
| 8 8 5 10 2 8 8 8 5 9 9 244 244 254 1 1 14 14 308 3 3 3 2 3 3 3 3 3 2 2 2 2 2 8 8 8 8 8 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_quota_defs.h" #include "xfs_rtgroup.h" static void xfs_health_unmount_group( struct xfs_group *xg, bool *warn) { unsigned int sick = 0; unsigned int checked = 0; xfs_group_measure_sickness(xg, &sick, &checked); if (sick) { trace_xfs_group_unfixed_corruption(xg, sick); *warn = true; } } /* * Warn about metadata corruption that we detected but haven't fixed, and * make sure we're not sitting on anything that would get in the way of * recovery. */ void xfs_health_unmount( struct xfs_mount *mp) { struct xfs_perag *pag = NULL; struct xfs_rtgroup *rtg = NULL; unsigned int sick = 0; unsigned int checked = 0; bool warn = false; if (xfs_is_shutdown(mp)) return; /* Measure AG corruption levels. */ while ((pag = xfs_perag_next(mp, pag))) xfs_health_unmount_group(pag_group(pag), &warn); /* Measure realtime group corruption levels. */ while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_health_unmount_group(rtg_group(rtg), &warn); /* * Measure fs corruption and keep the sample around for the warning. * See the note below for why we exempt FS_COUNTERS. */ xfs_fs_measure_sickness(mp, &sick, &checked); if (sick & ~XFS_SICK_FS_COUNTERS) { trace_xfs_fs_unfixed_corruption(mp, sick); warn = true; } if (warn) { xfs_warn(mp, "Uncorrected metadata errors detected; please run xfs_repair."); /* * We discovered uncorrected metadata problems at some point * during this filesystem mount and have advised the * administrator to run repair once the unmount completes. * * However, we must be careful -- when FSCOUNTERS are flagged * unhealthy, the unmount procedure omits writing the clean * unmount record to the log so that the next mount will run * recovery and recompute the summary counters. In other * words, we leave a dirty log to get the counters fixed. * * Unfortunately, xfs_repair cannot recover dirty logs, so if * there were filesystem problems, FSCOUNTERS was flagged, and * the administrator takes our advice to run xfs_repair, * they'll have to zap the log before repairing structures. * We don't really want to encourage this, so we mark the * FSCOUNTERS healthy so that a subsequent repair run won't see * a dirty log. */ if (sick & XFS_SICK_FS_COUNTERS) xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); } } /* Mark unhealthy per-fs metadata. */ void xfs_fs_mark_sick( struct xfs_mount *mp, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick |= mask; spin_unlock(&mp->m_sb_lock); } /* Mark per-fs metadata as having been checked and found unhealthy by fsck. */ void xfs_fs_mark_corrupt( struct xfs_mount *mp, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_corrupt(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick |= mask; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); } /* Mark a per-fs metadata healed. */ void xfs_fs_mark_healthy( struct xfs_mount *mp, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick &= ~mask; if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY)) mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); } /* Sample which per-fs metadata are unhealthy. */ void xfs_fs_measure_sickness( struct xfs_mount *mp, unsigned int *sick, unsigned int *checked) { spin_lock(&mp->m_sb_lock); *sick = mp->m_fs_sick; *checked = mp->m_fs_checked; spin_unlock(&mp->m_sb_lock); } /* Mark unhealthy per-ag metadata given a raw AG number. */ void xfs_agno_mark_sick( struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int mask) { struct xfs_perag *pag = xfs_perag_get(mp, agno); /* per-ag structure not set up yet? */ if (!pag) return; xfs_ag_mark_sick(pag, mask); xfs_perag_put(pag); } static inline void xfs_group_check_mask( struct xfs_group *xg, unsigned int mask) { if (xg->xg_type == XG_TYPE_AG) ASSERT(!(mask & ~XFS_SICK_AG_ALL)); else ASSERT(!(mask & ~XFS_SICK_RG_ALL)); } /* Mark unhealthy per-ag metadata. */ void xfs_group_mark_sick( struct xfs_group *xg, unsigned int mask) { xfs_group_check_mask(xg, mask); trace_xfs_group_mark_sick(xg, mask); spin_lock(&xg->xg_state_lock); xg->xg_sick |= mask; spin_unlock(&xg->xg_state_lock); } /* * Mark per-group metadata as having been checked and found unhealthy by fsck. */ void xfs_group_mark_corrupt( struct xfs_group *xg, unsigned int mask) { xfs_group_check_mask(xg, mask); trace_xfs_group_mark_corrupt(xg, mask); spin_lock(&xg->xg_state_lock); xg->xg_sick |= mask; xg->xg_checked |= mask; spin_unlock(&xg->xg_state_lock); } /* * Mark per-group metadata ok. */ void xfs_group_mark_healthy( struct xfs_group *xg, unsigned int mask) { xfs_group_check_mask(xg, mask); trace_xfs_group_mark_healthy(xg, mask); spin_lock(&xg->xg_state_lock); xg->xg_sick &= ~mask; if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY)) xg->xg_sick &= ~XFS_SICK_AG_SECONDARY; xg->xg_checked |= mask; spin_unlock(&xg->xg_state_lock); } /* Sample which per-ag metadata are unhealthy. */ void xfs_group_measure_sickness( struct xfs_group *xg, unsigned int *sick, unsigned int *checked) { spin_lock(&xg->xg_state_lock); *sick = xg->xg_sick; *checked = xg->xg_checked; spin_unlock(&xg->xg_state_lock); } /* Mark unhealthy per-rtgroup metadata given a raw rt group number. */ void xfs_rgno_mark_sick( struct xfs_mount *mp, xfs_rgnumber_t rgno, unsigned int mask) { struct xfs_rtgroup *rtg = xfs_rtgroup_get(mp, rgno); /* per-rtgroup structure not set up yet? */ if (!rtg) return; xfs_group_mark_sick(rtg_group(rtg), mask); xfs_rtgroup_put(rtg); } /* Mark the unhealthy parts of an inode. */ void xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick |= mask; spin_unlock(&ip->i_flags_lock); /* * Keep this inode around so we don't lose the sickness report. Scrub * grabs inodes with DONTCACHE assuming that most inode are ok, which * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); } /* Mark inode metadata as having been checked and found unhealthy by fsck. */ void xfs_inode_mark_corrupt( struct xfs_inode *ip, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_corrupt(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); /* * Keep this inode around so we don't lose the sickness report. Scrub * grabs inodes with DONTCACHE assuming that most inode are ok, which * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); VFS_I(ip)->i_state &= ~I_DONTCACHE; spin_unlock(&VFS_I(ip)->i_lock); } /* Mark parts of an inode healed. */ void xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick &= ~mask; if (!(ip->i_sick & XFS_SICK_INO_PRIMARY)) ip->i_sick &= ~XFS_SICK_INO_SECONDARY; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); } /* Sample which parts of an inode are unhealthy. */ void xfs_inode_measure_sickness( struct xfs_inode *ip, unsigned int *sick, unsigned int *checked) { spin_lock(&ip->i_flags_lock); *sick = ip->i_sick; *checked = ip->i_checked; spin_unlock(&ip->i_flags_lock); } /* Mappings between internal sick masks and ioctl sick masks. */ struct ioctl_sick_map { unsigned int sick_mask; unsigned int ioctl_mask; }; #define for_each_sick_map(map, m) \ for ((m) = (map); (m) < (map) + ARRAY_SIZE(map); (m)++) static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS}, { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, { XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS }, { XFS_SICK_FS_METADIR, XFS_FSOP_GEOM_SICK_METADIR }, { XFS_SICK_FS_METAPATH, XFS_FSOP_GEOM_SICK_METAPATH }, }; static const struct ioctl_sick_map rt_map[] = { { XFS_SICK_RG_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, { XFS_SICK_RG_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, }; static inline void xfgeo_health_tick( struct xfs_fsop_geom *geo, unsigned int sick, unsigned int checked, const struct ioctl_sick_map *m) { if (checked & m->sick_mask) geo->checked |= m->ioctl_mask; if (sick & m->sick_mask) geo->sick |= m->ioctl_mask; } /* Fill out fs geometry health info. */ void xfs_fsop_geom_health( struct xfs_mount *mp, struct xfs_fsop_geom *geo) { struct xfs_rtgroup *rtg = NULL; const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; geo->sick = 0; geo->checked = 0; xfs_fs_measure_sickness(mp, &sick, &checked); for_each_sick_map(fs_map, m) xfgeo_health_tick(geo, sick, checked, m); while ((rtg = xfs_rtgroup_next(mp, rtg))) { xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); for_each_sick_map(rt_map, m) xfgeo_health_tick(geo, sick, checked, m); } } static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB }, { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF }, { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL }, { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI }, { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT }, { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT }, { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT }, { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT }, { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES }, }; /* Fill out ag geometry health info. */ void xfs_ag_geom_health( struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; ageo->ag_sick = 0; ageo->ag_checked = 0; xfs_group_measure_sickness(pag_group(pag), &sick, &checked); for_each_sick_map(ag_map, m) { if (checked & m->sick_mask) ageo->ag_checked |= m->ioctl_mask; if (sick & m->sick_mask) ageo->ag_sick |= m->ioctl_mask; } } static const struct ioctl_sick_map rtgroup_map[] = { { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, }; /* Fill out rtgroup geometry health info. */ void xfs_rtgroup_geom_health( struct xfs_rtgroup *rtg, struct xfs_rtgroup_geometry *rgeo) { const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; rgeo->rg_sick = 0; rgeo->rg_checked = 0; xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked); for_each_sick_map(rtgroup_map, m) { if (checked & m->sick_mask) rgeo->rg_checked |= m->ioctl_mask; if (sick & m->sick_mask) rgeo->rg_sick |= m->ioctl_mask; } } static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA }, { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC }, { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR }, { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE }, }; /* Fill out bulkstat health info. */ void xfs_bulkstat_health( struct xfs_inode *ip, struct xfs_bulkstat *bs) { const struct ioctl_sick_map *m; unsigned int sick; unsigned int checked; bs->bs_sick = 0; bs->bs_checked = 0; xfs_inode_measure_sickness(ip, &sick, &checked); for_each_sick_map(ino_map, m) { if (checked & m->sick_mask) bs->bs_checked |= m->ioctl_mask; if (sick & m->sick_mask) bs->bs_sick |= m->ioctl_mask; } } /* Mark a block mapping sick. */ void xfs_bmap_mark_sick( struct xfs_inode *ip, int whichfork) { unsigned int mask; switch (whichfork) { case XFS_DATA_FORK: mask = XFS_SICK_INO_BMBTD; break; case XFS_ATTR_FORK: mask = XFS_SICK_INO_BMBTA; break; case XFS_COW_FORK: mask = XFS_SICK_INO_BMBTC; break; default: ASSERT(0); return; } xfs_inode_mark_sick(ip, mask); } /* Record observations of btree corruption with the health tracking system. */ void xfs_btree_mark_sick( struct xfs_btree_cur *cur) { if (xfs_btree_is_bmap(cur->bc_ops)) { xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); /* no health state tracking for ephemeral btrees */ } else if (cur->bc_ops->type != XFS_BTREE_TYPE_MEM) { ASSERT(cur->bc_group); ASSERT(cur->bc_ops->sick_mask); xfs_group_mark_sick(cur->bc_group, cur->bc_ops->sick_mask); } } /* * Record observations of dir/attr btree corruption with the health tracking * system. */ void xfs_dirattr_mark_sick( struct xfs_inode *ip, int whichfork) { unsigned int mask; switch (whichfork) { case XFS_DATA_FORK: mask = XFS_SICK_INO_DIR; break; case XFS_ATTR_FORK: mask = XFS_SICK_INO_XATTR; break; default: ASSERT(0); return; } xfs_inode_mark_sick(ip, mask); } /* * Record observations of dir/attr btree corruption with the health tracking * system. */ void xfs_da_mark_sick( struct xfs_da_args *args) { xfs_dirattr_mark_sick(args->dp, args->whichfork); } |
| 1311 1312 1309 1124 1125 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 Google LLC */ /* * Refer to Documentation/block/inline-encryption.rst for detailed explanation. */ #define pr_fmt(fmt) "blk-crypto: " fmt #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-crypto-profile.h> #include <linux/module.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, }, [BLK_ENCRYPTION_MODE_SM4_XTS] = { .name = "SM4-XTS", .cipher_str = "xts(sm4)", .keysize = 32, .ivsize = 16, }, }; /* * This number needs to be at least (the number of threads doing IO * concurrently) * (maximum recursive depth of a bio), so that we don't * deadlock on crypt_ctx allocations. The default is chosen to be the same * as the default number of post read contexts in both EXT4 and F2FS. */ static int num_prealloc_crypt_ctxs = 128; module_param(num_prealloc_crypt_ctxs, int, 0444); MODULE_PARM_DESC(num_prealloc_crypt_ctxs, "Number of bio crypto contexts to preallocate"); static struct kmem_cache *bio_crypt_ctx_cache; static mempool_t *bio_crypt_ctx_pool; static int __init bio_crypt_ctx_init(void) { size_t i; bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); if (!bio_crypt_ctx_cache) goto out_no_mem; bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, bio_crypt_ctx_cache); if (!bio_crypt_ctx_pool) goto out_no_mem; /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); /* Sanity check that no algorithm exceeds the defined limits. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); } return 0; out_no_mem: panic("Failed to allocate mem for bio crypt ctxs\n"); } subsys_initcall(bio_crypt_ctx_init); void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) { struct bio_crypt_ctx *bc; /* * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so * that the mempool_alloc() can't fail. */ WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); bc->bc_key = key; memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); bio->bi_crypt_context = bc; } void __bio_crypt_free_ctx(struct bio *bio) { mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); bio->bi_crypt_context = NULL; } int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!dst->bi_crypt_context) return -ENOMEM; *dst->bi_crypt_context = *src->bi_crypt_context; return 0; } /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc) { int i; for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { dun[i] += inc; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if (dun[i] < inc) inc = 1; else inc = 0; } } void __bio_crypt_advance(struct bio *bio, unsigned int bytes) { struct bio_crypt_ctx *bc = bio->bi_crypt_context; bio_crypt_dun_increment(bc->bc_dun, bytes >> bc->bc_key->data_unit_size_bits); } /* * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to * @next_dun, treating the DUNs as multi-limb integers. */ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { int i; unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { if (bc->bc_dun[i] + carry != next_dun[i]) return false; /* * If the addition in this limb overflowed, then we need to * carry 1 into the next limb. Else the carry is 0. */ if ((bc->bc_dun[i] + carry) < carry) carry = 1; else carry = 0; } /* If the DUN wrapped through 0, don't treat it as contiguous. */ return carry == 0; } /* * Checks that two bio crypt contexts are compatible - i.e. that * they are mergeable except for data_unit_num continuity. */ static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, struct bio_crypt_ctx *bc2) { if (!bc1) return !bc2; return bc2 && bc1->bc_key == bc2->bc_key; } bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); } /* * Checks that two bio crypt contexts are compatible, and also * that their data_unit_nums are continuous (and can hence be merged) * in the order @bc1 followed by @bc2. */ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2) { if (!bio_crypt_ctx_compatible(bc1, bc2)) return false; return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); } /* Check that all I/O segments are data unit aligned. */ static bool bio_crypt_check_alignment(struct bio *bio) { const unsigned int data_unit_size = bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; struct bvec_iter iter; struct bio_vec bv; bio_for_each_segment(bv, bio, iter) { if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) return false; } return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, rq->crypt_ctx->bc_key, &rq->crypt_keyslot); } void __blk_crypto_rq_put_keyslot(struct request *rq) { blk_crypto_put_keyslot(rq->crypt_keyslot); rq->crypt_keyslot = NULL; } void __blk_crypto_free_request(struct request *rq) { /* The keyslot, if one was needed, should have been released earlier. */ if (WARN_ON_ONCE(rq->crypt_keyslot)) __blk_crypto_rq_put_keyslot(rq); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); rq->crypt_ctx = NULL; } /** * __blk_crypto_bio_prep - Prepare bio for inline encryption * * @bio_ptr: pointer to original bio pointer * * If the bio crypt context provided for the bio is supported by the underlying * device's inline encryption hardware, do nothing. * * Otherwise, try to perform en/decryption for this bio by falling back to the * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * * Caller must ensure bio has bio_crypt_ctx. * * Return: true on success; false on error (and bio->bi_status will be set * appropriately, and bio_endio() will have been called so bio * submission should abort). */ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { bio->bi_status = BLK_STS_IOERR; goto fail; } if (!bio_crypt_check_alignment(bio)) { bio->bi_status = BLK_STS_IOERR; goto fail; } /* * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ if (blk_crypto_config_supported_natively(bio->bi_bdev, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: bio_endio(*bio_ptr); return false; } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (!rq->crypt_ctx) { rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); if (!rq->crypt_ctx) return -ENOMEM; } *rq->crypt_ctx = *bio->bi_crypt_context; return 0; } /** * blk_crypto_init_key() - Prepare a key for use with blk-crypto * @blk_key: Pointer to the blk_crypto_key to initialize. * @raw_key: Pointer to the raw key. Must be the correct length for the chosen * @crypto_mode; see blk_crypto_modes[]. * @crypto_mode: identifier for the encryption algorithm to use * @dun_bytes: number of bytes that will be used to specify the DUN when this * key is used * @data_unit_size: the data unit size to use for en/decryption * * Return: 0 on success, -errno on failure. The caller is responsible for * zeroizing both blk_key and raw_key when done with them. */ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size) { const struct blk_crypto_mode *mode; memset(blk_key, 0, sizeof(*blk_key)); if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) return -EINVAL; mode = &blk_crypto_modes[crypto_mode]; if (mode->keysize == 0) return -EINVAL; if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) return -EINVAL; blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; blk_key->data_unit_size_bits = ilog2(data_unit_size); blk_key->size = mode->keysize; memcpy(blk_key->raw, raw_key, mode->keysize); return 0; } bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg) { return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, cfg); } /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device * @bdev: block device to operate on * @key: A key to use on the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms * for the needed mode allocated and ready to go. This function may allocate * an skcipher, and *should not* be called from the data path, since that might * cause a deadlock * * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device * @bdev: a block_device on which I/O using the key may have been done * @key: the key to evict * * For a given block_device, this function removes the given blk_crypto_key from * the keyslot management structures and evicts it from any underlying hardware * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. * * Upper layers must call this before freeing the blk_crypto_key. It must be * called for every block_device the key may have been used on. The key must no * longer be in use by any I/O when this function is called. * * Context: May sleep. */ void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { struct request_queue *q = bdev_get_queue(bdev); int err; if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) err = __blk_crypto_evict_key(q->crypto_profile, key); else err = blk_crypto_fallback_evict_key(key); /* * An error can only occur here if the key failed to be evicted from a * keyslot (due to a hardware or driver issue) or is allegedly still in * use by I/O (due to a kernel bug). Even in these cases, the key is * still unlinked from the keyslot management structures, and the caller * is allowed and expected to free it right away. There's nothing * callers can do to handle errors, so just log them and return void. */ if (err) pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } EXPORT_SYMBOL_GPL(blk_crypto_evict_key); |
| 3 36 38 38 38 38 38 38 38 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "acl.h" #include "xattr.h" #include <linux/posix_acl.h> static const char * const acl_types[] = { [ACL_USER_OBJ] = "user_obj", [ACL_USER] = "user", [ACL_GROUP_OBJ] = "group_obj", [ACL_GROUP] = "group", [ACL_MASK] = "mask", [ACL_OTHER] = "other", NULL, }; void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) { const void *p, *end = value + size; if (!value || size < sizeof(bch_acl_header) || ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) return; p = value + sizeof(bch_acl_header); while (p < end) { const bch_acl_entry *in = p; unsigned tag = le16_to_cpu(in->e_tag); prt_str(out, acl_types[tag]); switch (tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: p += sizeof(bch_acl_entry_short); break; case ACL_USER: prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); p += sizeof(bch_acl_entry); break; case ACL_GROUP: prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); p += sizeof(bch_acl_entry); break; } prt_printf(out, " %o", le16_to_cpu(in->e_perm)); if (p != end) prt_char(out, ' '); } } #ifdef CONFIG_BCACHEFS_POSIX_ACL #include "fs.h" #include <linux/fs.h> #include <linux/posix_acl_xattr.h> #include <linux/sched.h> #include <linux/slab.h> static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) { return sizeof(bch_acl_header) + sizeof(bch_acl_entry_short) * nr_short + sizeof(bch_acl_entry) * nr_long; } static inline int acl_to_xattr_type(int type) { switch (type) { case ACL_TYPE_ACCESS: return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; case ACL_TYPE_DEFAULT: return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; default: BUG(); } } /* * Convert from filesystem to in-memory representation. */ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, const void *value, size_t size) { const void *p, *end = value + size; struct posix_acl *acl; struct posix_acl_entry *out; unsigned count = 0; int ret; if (!value) return NULL; if (size < sizeof(bch_acl_header)) goto invalid; if (((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) goto invalid; p = value + sizeof(bch_acl_header); while (p < end) { const bch_acl_entry *entry = p; if (p + sizeof(bch_acl_entry_short) > end) goto invalid; switch (le16_to_cpu(entry->e_tag)) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: p += sizeof(bch_acl_entry_short); break; case ACL_USER: case ACL_GROUP: p += sizeof(bch_acl_entry); break; default: goto invalid; } count++; } if (p > end) goto invalid; if (!count) return NULL; acl = allocate_dropping_locks(trans, ret, posix_acl_alloc(count, _gfp)); if (!acl) return ERR_PTR(-ENOMEM); if (ret) { kfree(acl); return ERR_PTR(ret); } out = acl->a_entries; p = value + sizeof(bch_acl_header); while (p < end) { const bch_acl_entry *in = p; out->e_tag = le16_to_cpu(in->e_tag); out->e_perm = le16_to_cpu(in->e_perm); switch (out->e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: p += sizeof(bch_acl_entry_short); break; case ACL_USER: out->e_uid = make_kuid(&init_user_ns, le32_to_cpu(in->e_id)); p += sizeof(bch_acl_entry); break; case ACL_GROUP: out->e_gid = make_kgid(&init_user_ns, le32_to_cpu(in->e_id)); p += sizeof(bch_acl_entry); break; } out++; } BUG_ON(out != acl->a_entries + acl->a_count); return acl; invalid: pr_err("invalid acl entry"); return ERR_PTR(-EINVAL); } #define acl_for_each_entry(acl, acl_e) \ for (acl_e = acl->a_entries; \ acl_e < acl->a_entries + acl->a_count; \ acl_e++) /* * Convert from in-memory to filesystem representation. */ static struct bkey_i_xattr * bch2_acl_to_xattr(struct btree_trans *trans, const struct posix_acl *acl, int type) { struct bkey_i_xattr *xattr; bch_acl_header *acl_header; const struct posix_acl_entry *acl_e; void *outptr; unsigned nr_short = 0, nr_long = 0, acl_len, u64s; acl_for_each_entry(acl, acl_e) { switch (acl_e->e_tag) { case ACL_USER: case ACL_GROUP: nr_long++; break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: nr_short++; break; default: return ERR_PTR(-EINVAL); } } acl_len = bch2_acl_size(nr_short, nr_long); u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); if (u64s > U8_MAX) return ERR_PTR(-E2BIG); xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); if (IS_ERR(xattr)) return xattr; bkey_xattr_init(&xattr->k_i); xattr->k.u64s = u64s; xattr->v.x_type = acl_to_xattr_type(type); xattr->v.x_name_len = 0; xattr->v.x_val_len = cpu_to_le16(acl_len); acl_header = xattr_val(&xattr->v); acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); outptr = (void *) acl_header + sizeof(*acl_header); acl_for_each_entry(acl, acl_e) { bch_acl_entry *entry = outptr; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); outptr += sizeof(bch_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); outptr += sizeof(bch_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: outptr += sizeof(bch_acl_entry_short); break; } } BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); return xattr; } struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; if (rcu) return ERR_PTR(-ECHILD); struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, inode_inum(inode), &search, 0); int ret = bkey_err(k); if (ret) goto err; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (ret) acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL; if (!IS_ERR_OR_NULL(acl)) set_cached_acl(&inode->v, type, acl); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return acl; } int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, struct posix_acl *acl, int type) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode_u->bi_mode)) return acl ? -EACCES : 0; if (acl) { struct bkey_i_xattr *xattr = bch2_acl_to_xattr(trans, acl, type); if (IS_ERR(xattr)) return PTR_ERR(xattr); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, inum, &search); } return bch2_err_matches(ret, ENOENT) ? 0 : ret; } int bch2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *_acl, int type) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; umode_t mode; int ret; mutex_lock(&inode->ei_update_lock); struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); acl = _acl; ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) goto btree_err; mode = inode_u.bi_mode; if (type == ACL_TYPE_ACCESS) { ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); if (ret) goto btree_err; } ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, 0); btree_err: bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err; bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); err: bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; struct posix_acl *acl = NULL; struct bkey_s_c k = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash_info, inum, &search, BTREE_ITER_intent); int ret = bkey_err(k); if (ret) return bch2_err_matches(ret, ENOENT) ? 0 : ret; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); if (ret) goto err; ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; struct bkey_i_xattr *new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; new->k.p = iter.pos; ret = bch2_trans_update(trans, &iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: bch2_trans_iter_exit(trans, &iter); if (!IS_ERR_OR_NULL(acl)) kfree(acl); return ret; } #endif /* CONFIG_BCACHEFS_POSIX_ACL */ |
| 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 | // SPDX-License-Identifier: GPL-2.0-only /* * vivid-kthread-cap.h - video/vbi capture thread support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/font.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/random.h> #include <linux/v4l2-dv-timings.h> #include <linux/jiffies.h> #include <asm/div64.h> #include <media/videobuf2-vmalloc.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-radio-common.h" #include "vivid-radio-rx.h" #include "vivid-radio-tx.h" #include "vivid-sdr-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-out.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #include "vivid-kthread-cap.h" #include "vivid-meta-cap.h" static inline v4l2_std_id vivid_get_std_cap(const struct vivid_dev *dev) { if (vivid_is_sdtv_cap(dev)) return dev->std_cap[dev->input]; return 0; } static void copy_pix(struct vivid_dev *dev, int win_y, int win_x, u16 *cap, const u16 *osd) { u16 out; out = *cap; *cap = *osd; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_CHROMAKEY) && *osd != dev->chromakey_out) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_SRC_CHROMAKEY) && out == dev->chromakey_out) return; if (dev->fmt_cap->alpha_mask) { if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_GLOBAL_ALPHA) && dev->global_alpha_out) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_LOCAL_ALPHA) && *cap & dev->fmt_cap->alpha_mask) return; if ((dev->fbuf_out_flags & V4L2_FBUF_FLAG_LOCAL_INV_ALPHA) && !(*cap & dev->fmt_cap->alpha_mask)) return; } *cap = out; } static void blend_line(struct vivid_dev *dev, unsigned y_offset, unsigned x_offset, u8 *vcapbuf, const u8 *vosdbuf, unsigned width, unsigned pixsize) { unsigned x; for (x = 0; x < width; x++, vcapbuf += pixsize, vosdbuf += pixsize) { copy_pix(dev, y_offset, x_offset + x, (u16 *)vcapbuf, (const u16 *)vosdbuf); } } static void scale_line(const u8 *src, u8 *dst, unsigned srcw, unsigned dstw, unsigned twopixsize) { /* Coarse scaling with Bresenham */ unsigned int_part; unsigned fract_part; unsigned src_x = 0; unsigned error = 0; unsigned x; /* * We always combine two pixels to prevent color bleed in the packed * yuv case. */ srcw /= 2; dstw /= 2; int_part = srcw / dstw; fract_part = srcw % dstw; for (x = 0; x < dstw; x++, dst += twopixsize) { memcpy(dst, src + src_x * twopixsize, twopixsize); src_x += int_part; error += fract_part; if (error >= dstw) { error -= dstw; src_x++; } } } /* * Precalculate the rectangles needed to perform video looping: * * The nominal pipeline is that the video output buffer is cropped by * crop_out, scaled to compose_out, overlaid with the output overlay, * cropped on the capture side by crop_cap and scaled again to the video * capture buffer using compose_cap. * * To keep things efficient we calculate the intersection of compose_out * and crop_cap (since that's the only part of the video that will * actually end up in the capture buffer), determine which part of the * video output buffer that is and which part of the video capture buffer * so we can scale the video straight from the output buffer to the capture * buffer without any intermediate steps. * * If we need to deal with an output overlay, then there is no choice and * that intermediate step still has to be taken. For the output overlay * support we calculate the intersection of the framebuffer and the overlay * window (which may be partially or wholly outside of the framebuffer * itself) and the intersection of that with loop_vid_copy (i.e. the part of * the actual looped video that will be overlaid). The result is calculated * both in framebuffer coordinates (loop_fb_copy) and compose_out coordinates * (loop_vid_overlay). Finally calculate the part of the capture buffer that * will receive that overlaid video. */ static void vivid_precalc_copy_rects(struct vivid_dev *dev, struct vivid_dev *out_dev) { /* Framebuffer rectangle */ struct v4l2_rect r_fb = { 0, 0, dev->display_width, dev->display_height }; /* Overlay window rectangle in framebuffer coordinates */ struct v4l2_rect r_overlay = { out_dev->overlay_out_left, out_dev->overlay_out_top, out_dev->compose_out.width, out_dev->compose_out.height }; v4l2_rect_intersect(&dev->loop_vid_copy, &dev->crop_cap, &out_dev->compose_out); dev->loop_vid_out = dev->loop_vid_copy; v4l2_rect_scale(&dev->loop_vid_out, &out_dev->compose_out, &out_dev->crop_out); dev->loop_vid_out.left += out_dev->crop_out.left; dev->loop_vid_out.top += out_dev->crop_out.top; dev->loop_vid_cap = dev->loop_vid_copy; v4l2_rect_scale(&dev->loop_vid_cap, &dev->crop_cap, &dev->compose_cap); dprintk(dev, 1, "loop_vid_copy: %dx%d@%dx%d loop_vid_out: %dx%d@%dx%d loop_vid_cap: %dx%d@%dx%d\n", dev->loop_vid_copy.width, dev->loop_vid_copy.height, dev->loop_vid_copy.left, dev->loop_vid_copy.top, dev->loop_vid_out.width, dev->loop_vid_out.height, dev->loop_vid_out.left, dev->loop_vid_out.top, dev->loop_vid_cap.width, dev->loop_vid_cap.height, dev->loop_vid_cap.left, dev->loop_vid_cap.top); v4l2_rect_intersect(&r_overlay, &r_fb, &r_overlay); /* shift r_overlay to the same origin as compose_out */ r_overlay.left += out_dev->compose_out.left - out_dev->overlay_out_left; r_overlay.top += out_dev->compose_out.top - out_dev->overlay_out_top; v4l2_rect_intersect(&dev->loop_vid_overlay, &r_overlay, &dev->loop_vid_copy); dev->loop_fb_copy = dev->loop_vid_overlay; /* shift dev->loop_fb_copy back again to the fb origin */ dev->loop_fb_copy.left -= out_dev->compose_out.left - out_dev->overlay_out_left; dev->loop_fb_copy.top -= out_dev->compose_out.top - out_dev->overlay_out_top; dev->loop_vid_overlay_cap = dev->loop_vid_overlay; v4l2_rect_scale(&dev->loop_vid_overlay_cap, &dev->crop_cap, &dev->compose_cap); dprintk(dev, 1, "loop_fb_copy: %dx%d@%dx%d loop_vid_overlay: %dx%d@%dx%d loop_vid_overlay_cap: %dx%d@%dx%d\n", dev->loop_fb_copy.width, dev->loop_fb_copy.height, dev->loop_fb_copy.left, dev->loop_fb_copy.top, dev->loop_vid_overlay.width, dev->loop_vid_overlay.height, dev->loop_vid_overlay.left, dev->loop_vid_overlay.top, dev->loop_vid_overlay_cap.width, dev->loop_vid_overlay_cap.height, dev->loop_vid_overlay_cap.left, dev->loop_vid_overlay_cap.top); } static void *plane_vaddr(struct tpg_data *tpg, struct vivid_buffer *buf, unsigned p, unsigned bpl[TPG_MAX_PLANES], unsigned h) { unsigned i; void *vbuf; if (p == 0 || tpg_g_buffers(tpg) > 1) return vb2_plane_vaddr(&buf->vb.vb2_buf, p); vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); for (i = 0; i < p; i++) vbuf += bpl[i] * h / tpg->vdownsampling[i]; return vbuf; } static noinline_for_stack int vivid_copy_buffer(struct vivid_dev *dev, struct vivid_dev *out_dev, unsigned p, u8 *vcapbuf, struct vivid_buffer *vid_cap_buf) { bool blank = dev->must_blank[vid_cap_buf->vb.vb2_buf.index]; struct tpg_data *tpg = &dev->tpg; struct vivid_buffer *vid_out_buf = NULL; unsigned vdiv = out_dev->fmt_out->vdownsampling[p]; unsigned twopixsize = tpg_g_twopixelsize(tpg, p); unsigned img_width = tpg_hdiv(tpg, p, dev->compose_cap.width); unsigned img_height = dev->compose_cap.height; unsigned stride_cap = tpg->bytesperline[p]; unsigned stride_out = out_dev->bytesperline_out[p]; unsigned stride_osd = dev->display_byte_stride; unsigned hmax = (img_height * tpg->perc_fill) / 100; u8 *voutbuf; u8 *vosdbuf = NULL; unsigned y; bool blend = out_dev->fbuf_out_flags; /* Coarse scaling with Bresenham */ unsigned vid_out_int_part; unsigned vid_out_fract_part; unsigned vid_out_y = 0; unsigned vid_out_error = 0; unsigned vid_overlay_int_part = 0; unsigned vid_overlay_fract_part = 0; unsigned vid_overlay_y = 0; unsigned vid_overlay_error = 0; unsigned vid_cap_left = tpg_hdiv(tpg, p, dev->loop_vid_cap.left); unsigned vid_cap_right; bool quick; vid_out_int_part = dev->loop_vid_out.height / dev->loop_vid_cap.height; vid_out_fract_part = dev->loop_vid_out.height % dev->loop_vid_cap.height; if (!list_empty(&out_dev->vid_out_active)) vid_out_buf = list_entry(out_dev->vid_out_active.next, struct vivid_buffer, list); if (vid_out_buf == NULL) return -ENODATA; vid_cap_buf->vb.field = vid_out_buf->vb.field; voutbuf = plane_vaddr(tpg, vid_out_buf, p, out_dev->bytesperline_out, out_dev->fmt_out_rect.height); if (p < out_dev->fmt_out->buffers) voutbuf += vid_out_buf->vb.vb2_buf.planes[p].data_offset; voutbuf += tpg_hdiv(tpg, p, dev->loop_vid_out.left) + (dev->loop_vid_out.top / vdiv) * stride_out; vcapbuf += tpg_hdiv(tpg, p, dev->compose_cap.left) + (dev->compose_cap.top / vdiv) * stride_cap; if (dev->loop_vid_copy.width == 0 || dev->loop_vid_copy.height == 0) { /* * If there is nothing to copy, then just fill the capture window * with black. */ for (y = 0; y < hmax / vdiv; y++, vcapbuf += stride_cap) memcpy(vcapbuf, tpg->black_line[p], img_width); return 0; } if (out_dev->overlay_out_enabled && dev->loop_vid_overlay.width && dev->loop_vid_overlay.height) { vosdbuf = dev->video_vbase; vosdbuf += (dev->loop_fb_copy.left * twopixsize) / 2 + dev->loop_fb_copy.top * stride_osd; vid_overlay_int_part = dev->loop_vid_overlay.height / dev->loop_vid_overlay_cap.height; vid_overlay_fract_part = dev->loop_vid_overlay.height % dev->loop_vid_overlay_cap.height; } vid_cap_right = tpg_hdiv(tpg, p, dev->loop_vid_cap.left + dev->loop_vid_cap.width); /* quick is true if no video scaling is needed */ quick = dev->loop_vid_out.width == dev->loop_vid_cap.width; dev->cur_scaled_line = dev->loop_vid_out.height; for (y = 0; y < hmax; y += vdiv, vcapbuf += stride_cap) { /* osdline is true if this line requires overlay blending */ bool osdline = vosdbuf && y >= dev->loop_vid_overlay_cap.top && y < dev->loop_vid_overlay_cap.top + dev->loop_vid_overlay_cap.height; /* * If this line of the capture buffer doesn't get any video, then * just fill with black. */ if (y < dev->loop_vid_cap.top || y >= dev->loop_vid_cap.top + dev->loop_vid_cap.height) { memcpy(vcapbuf, tpg->black_line[p], img_width); continue; } /* fill the left border with black */ if (dev->loop_vid_cap.left) memcpy(vcapbuf, tpg->black_line[p], vid_cap_left); /* fill the right border with black */ if (vid_cap_right < img_width) memcpy(vcapbuf + vid_cap_right, tpg->black_line[p], img_width - vid_cap_right); if (quick && !osdline) { memcpy(vcapbuf + vid_cap_left, voutbuf + vid_out_y * stride_out, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); goto update_vid_out_y; } if (dev->cur_scaled_line == vid_out_y) { memcpy(vcapbuf + vid_cap_left, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); goto update_vid_out_y; } if (!osdline) { scale_line(voutbuf + vid_out_y * stride_out, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_out.width), tpg_hdiv(tpg, p, dev->loop_vid_cap.width), tpg_g_twopixelsize(tpg, p)); } else { /* * Offset in bytes within loop_vid_copy to the start of the * loop_vid_overlay rectangle. */ unsigned offset = ((dev->loop_vid_overlay.left - dev->loop_vid_copy.left) * twopixsize) / 2; u8 *osd = vosdbuf + vid_overlay_y * stride_osd; scale_line(voutbuf + vid_out_y * stride_out, dev->blended_line, dev->loop_vid_out.width, dev->loop_vid_copy.width, tpg_g_twopixelsize(tpg, p)); if (blend) blend_line(dev, vid_overlay_y + dev->loop_vid_overlay.top, dev->loop_vid_overlay.left, dev->blended_line + offset, osd, dev->loop_vid_overlay.width, twopixsize / 2); else memcpy(dev->blended_line + offset, osd, (dev->loop_vid_overlay.width * twopixsize) / 2); scale_line(dev->blended_line, dev->scaled_line, dev->loop_vid_copy.width, dev->loop_vid_cap.width, tpg_g_twopixelsize(tpg, p)); } dev->cur_scaled_line = vid_out_y; memcpy(vcapbuf + vid_cap_left, dev->scaled_line, tpg_hdiv(tpg, p, dev->loop_vid_cap.width)); update_vid_out_y: if (osdline) { vid_overlay_y += vid_overlay_int_part; vid_overlay_error += vid_overlay_fract_part; if (vid_overlay_error >= dev->loop_vid_overlay_cap.height) { vid_overlay_error -= dev->loop_vid_overlay_cap.height; vid_overlay_y++; } } vid_out_y += vid_out_int_part; vid_out_error += vid_out_fract_part; if (vid_out_error >= dev->loop_vid_cap.height / vdiv) { vid_out_error -= dev->loop_vid_cap.height / vdiv; vid_out_y++; } } if (!blank) return 0; for (; y < img_height; y += vdiv, vcapbuf += stride_cap) memcpy(vcapbuf, tpg->contrast_line[p], img_width); return 0; } static void vivid_fillbuff(struct vivid_dev *dev, struct vivid_buffer *buf) { struct vivid_dev *out_dev = NULL; struct tpg_data *tpg = &dev->tpg; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_cap) ? 2 : 1; unsigned line_height = 16 / factor; bool is_tv = vivid_is_sdtv_cap(dev); bool is_60hz = is_tv && (dev->std_cap[dev->input] & V4L2_STD_525_60); unsigned p; int line = 1; u8 *basep[TPG_MAX_PLANES][2]; unsigned ms; char str[100]; s32 gain; buf->vb.sequence = dev->vid_cap_seq_count; v4l2_ctrl_s_ctrl(dev->ro_int32, buf->vb.sequence & 0xff); if (dev->field_cap == V4L2_FIELD_ALTERNATE) { /* * 60 Hz standards start with the bottom field, 50 Hz standards * with the top field. So if the 0-based seq_count is even, * then the field is TOP for 50 Hz and BOTTOM for 60 Hz * standards. */ buf->vb.field = ((dev->vid_cap_seq_count & 1) ^ is_60hz) ? V4L2_FIELD_BOTTOM : V4L2_FIELD_TOP; /* * The sequence counter counts frames, not fields. So divide * by two. */ buf->vb.sequence /= 2; } else { buf->vb.field = dev->field_cap; } tpg_s_field(tpg, buf->vb.field, dev->field_cap == V4L2_FIELD_ALTERNATE); tpg_s_perc_fill_blank(tpg, dev->must_blank[buf->vb.vb2_buf.index]); if (vivid_vid_can_loop(dev) && ((vivid_is_svid_cap(dev) && !VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) || (vivid_is_hdmi_cap(dev) && !VIVID_INVALID_SIGNAL(dev->dv_timings_signal_mode[dev->input])))) { out_dev = vivid_input_is_connected_to(dev); /* * If the vivid instance of the output device is different * from the vivid instance of this input device, then we * must take care to properly serialize the output device to * prevent that the buffer we are copying from is being freed. * * If the output device is part of the same instance, then the * lock is already taken and there is no need to take the mutex. * * The problem with taking the mutex is that you can get * deadlocked if instance A locks instance B and vice versa. * It is not really worth trying to be very smart about this, * so just try to take the lock, and if you can't, then just * set out_dev to NULL and you will end up with a single frame * of Noise (the default test pattern in this case). */ if (out_dev && dev != out_dev && !mutex_trylock(&out_dev->mutex)) out_dev = NULL; } if (out_dev) vivid_precalc_copy_rects(dev, out_dev); for (p = 0; p < tpg_g_planes(tpg); p++) { void *vbuf = plane_vaddr(tpg, buf, p, tpg->bytesperline, tpg->buf_height); /* * The first plane of a multiplanar format has a non-zero * data_offset. This helps testing whether the application * correctly supports non-zero data offsets. */ if (p < tpg_g_buffers(tpg) && dev->fmt_cap->data_offset[p]) { memset(vbuf, dev->fmt_cap->data_offset[p] & 0xff, dev->fmt_cap->data_offset[p]); vbuf += dev->fmt_cap->data_offset[p]; } tpg_calc_text_basep(tpg, basep, p, vbuf); if (!out_dev || vivid_copy_buffer(dev, out_dev, p, vbuf, buf)) tpg_fill_plane_buffer(tpg, vivid_get_std_cap(dev), p, vbuf); } if (out_dev && dev != out_dev) mutex_unlock(&out_dev->mutex); dev->must_blank[buf->vb.vb2_buf.index] = false; /* Updates stream time, only update at the start of a new frame. */ if (dev->field_cap != V4L2_FIELD_ALTERNATE || (dev->vid_cap_seq_count & 1) == 0) dev->ms_vid_cap = jiffies_to_msecs(jiffies - dev->jiffies_vid_cap); ms = dev->ms_vid_cap; if (dev->osd_mode <= 1) { snprintf(str, sizeof(str), " %02d:%02d:%02d:%03d %u%s", (ms / (60 * 60 * 1000)) % 24, (ms / (60 * 1000)) % 60, (ms / 1000) % 60, ms % 1000, buf->vb.sequence, (dev->field_cap == V4L2_FIELD_ALTERNATE) ? (buf->vb.field == V4L2_FIELD_TOP ? " top" : " bottom") : ""); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd_mode == 0) { snprintf(str, sizeof(str), " %dx%d, input %d ", dev->src_rect.width, dev->src_rect.height, dev->input); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); gain = v4l2_ctrl_g_ctrl(dev->gain); mutex_lock(dev->ctrl_hdl_user_vid.lock); snprintf(str, sizeof(str), " brightness %3d, contrast %3d, saturation %3d, hue %d ", dev->brightness->cur.val, dev->contrast->cur.val, dev->saturation->cur.val, dev->hue->cur.val); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " autogain %d, gain %3d, alpha 0x%02x ", dev->autogain->cur.val, gain, dev->alpha->cur.val); mutex_unlock(dev->ctrl_hdl_user_vid.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); mutex_lock(dev->ctrl_hdl_user_aud.lock); snprintf(str, sizeof(str), " volume %3d, mute %d ", dev->volume->cur.val, dev->mute->cur.val); mutex_unlock(dev->ctrl_hdl_user_aud.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); mutex_lock(dev->ctrl_hdl_user_gen.lock); snprintf(str, sizeof(str), " int32 %d, ro_int32 %d, int64 %lld, bitmask %08x ", dev->int32->cur.val, dev->ro_int32->cur.val, *dev->int64->p_cur.p_s64, dev->bitmask->cur.val); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " boolean %d, menu %s, string \"%s\" ", dev->boolean->cur.val, dev->menu->qmenu[dev->menu->cur.val], dev->string->p_cur.p_char); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); snprintf(str, sizeof(str), " integer_menu %lld, value %d ", dev->int_menu->qmenu_int[dev->int_menu->cur.val], dev->int_menu->cur.val); mutex_unlock(dev->ctrl_hdl_user_gen.lock); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); if (dev->button_pressed) { dev->button_pressed--; snprintf(str, sizeof(str), " button pressed!"); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd[0]) { if (vivid_is_hdmi_cap(dev)) { snprintf(str, sizeof(str), " OSD \"%s\"", dev->osd); tpg_gen_text(tpg, basep, line++ * line_height, 16, str); } if (dev->osd_jiffies && time_is_before_jiffies(dev->osd_jiffies + 5 * HZ)) { dev->osd[0] = 0; dev->osd_jiffies = 0; } } } } static void vivid_cap_update_frame_period(struct vivid_dev *dev) { u64 f_period; f_period = (u64)dev->timeperframe_vid_cap.numerator * 1000000000; if (WARN_ON(dev->timeperframe_vid_cap.denominator == 0)) dev->timeperframe_vid_cap.denominator = 1; do_div(f_period, dev->timeperframe_vid_cap.denominator); if (dev->field_cap == V4L2_FIELD_ALTERNATE) f_period >>= 1; /* * If "End of Frame", then offset the exposure time by 0.9 * of the frame period. */ dev->cap_frame_eof_offset = f_period * 9; do_div(dev->cap_frame_eof_offset, 10); dev->cap_frame_period = f_period; } static noinline_for_stack void vivid_thread_vid_cap_tick(struct vivid_dev *dev, int dropped_bufs) { struct vivid_buffer *vid_cap_buf = NULL; struct vivid_buffer *vbi_cap_buf = NULL; struct vivid_buffer *meta_cap_buf = NULL; u64 f_time = 0; dprintk(dev, 1, "Video Capture Thread Tick\n"); while (dropped_bufs-- > 1) tpg_update_mv_count(&dev->tpg, dev->field_cap == V4L2_FIELD_NONE || dev->field_cap == V4L2_FIELD_ALTERNATE); /* Drop a certain percentage of buffers. */ if (dev->perc_dropped_buffers && get_random_u32_below(100) < dev->perc_dropped_buffers) goto update_mv; spin_lock(&dev->slock); if (!list_empty(&dev->vid_cap_active)) { vid_cap_buf = list_entry(dev->vid_cap_active.next, struct vivid_buffer, list); list_del(&vid_cap_buf->list); } if (!list_empty(&dev->vbi_cap_active)) { if (dev->field_cap != V4L2_FIELD_ALTERNATE || (dev->vbi_cap_seq_count & 1)) { vbi_cap_buf = list_entry(dev->vbi_cap_active.next, struct vivid_buffer, list); list_del(&vbi_cap_buf->list); } } if (!list_empty(&dev->meta_cap_active)) { meta_cap_buf = list_entry(dev->meta_cap_active.next, struct vivid_buffer, list); list_del(&meta_cap_buf->list); } spin_unlock(&dev->slock); if (!vid_cap_buf && !vbi_cap_buf && !meta_cap_buf) goto update_mv; f_time = ktime_get_ns() + dev->time_wrap_offset; if (vid_cap_buf) { v4l2_ctrl_request_setup(vid_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); /* Fill buffer */ vivid_fillbuff(dev, vid_cap_buf); dprintk(dev, 1, "filled buffer %d\n", vid_cap_buf->vb.vb2_buf.index); v4l2_ctrl_request_complete(vid_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); vb2_buffer_done(&vid_cap_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vid_cap buffer %d done\n", vid_cap_buf->vb.vb2_buf.index); vid_cap_buf->vb.vb2_buf.timestamp = f_time; if (!dev->tstamp_src_is_soe) vid_cap_buf->vb.vb2_buf.timestamp += dev->cap_frame_eof_offset; } if (vbi_cap_buf) { u64 vbi_period; v4l2_ctrl_request_setup(vbi_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); if (vbi_cap_buf->vb.vb2_buf.type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE) vivid_sliced_vbi_cap_process(dev, vbi_cap_buf); else vivid_raw_vbi_cap_process(dev, vbi_cap_buf); v4l2_ctrl_request_complete(vbi_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); vb2_buffer_done(&vbi_cap_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vbi_cap %d done\n", vbi_cap_buf->vb.vb2_buf.index); /* If capturing a VBI, offset by 0.05 */ vbi_period = dev->cap_frame_period * 5; do_div(vbi_period, 100); vbi_cap_buf->vb.vb2_buf.timestamp = f_time + dev->cap_frame_eof_offset + vbi_period; } if (meta_cap_buf) { v4l2_ctrl_request_setup(meta_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vivid_meta_cap_fillbuff(dev, meta_cap_buf, f_time); v4l2_ctrl_request_complete(meta_cap_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vb2_buffer_done(&meta_cap_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "meta_cap %d done\n", meta_cap_buf->vb.vb2_buf.index); meta_cap_buf->vb.vb2_buf.timestamp = f_time + dev->cap_frame_eof_offset; } dev->dqbuf_error = false; update_mv: /* Update the test pattern movement counters */ tpg_update_mv_count(&dev->tpg, dev->field_cap == V4L2_FIELD_NONE || dev->field_cap == V4L2_FIELD_ALTERNATE); } static int vivid_thread_vid_cap(void *data) { struct vivid_dev *dev = data; u64 numerators_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; unsigned numerator; unsigned denominator; int dropped_bufs; dprintk(dev, 1, "Video Capture Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->cap_seq_offset = 0; dev->cap_seq_count = 0; dev->cap_seq_resync = false; dev->jiffies_vid_cap = jiffies; dev->cap_stream_start = ktime_get_ns(); if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - dev->cap_stream_start; else dev->time_wrap_offset = 0; vivid_cap_update_frame_period(dev); for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->cap_seq_resync) { dev->jiffies_vid_cap = cur_jiffies; dev->cap_seq_offset = dev->cap_seq_count + 1; dev->cap_seq_count = 0; dev->cap_stream_start += dev->cap_frame_period * dev->cap_seq_offset; vivid_cap_update_frame_period(dev); dev->cap_seq_resync = false; } numerator = dev->timeperframe_vid_cap.numerator; denominator = dev->timeperframe_vid_cap.denominator; if (dev->field_cap == V4L2_FIELD_ALTERNATE) denominator *= 2; /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_vid_cap; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2; do_div(buffers_since_start, HZ * numerator); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. */ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_vid_cap = cur_jiffies; dev->cap_seq_offset = buffers_since_start; buffers_since_start = 0; } dropped_bufs = buffers_since_start + dev->cap_seq_offset - dev->cap_seq_count; dev->cap_seq_count = buffers_since_start + dev->cap_seq_offset; dev->vid_cap_seq_count = dev->cap_seq_count - dev->vid_cap_seq_start; dev->vbi_cap_seq_count = dev->cap_seq_count - dev->vbi_cap_seq_start; dev->meta_cap_seq_count = dev->cap_seq_count - dev->meta_cap_seq_start; vivid_thread_vid_cap_tick(dev, dropped_bufs); /* * Calculate the number of 'numerators' streamed since we started, * including the current buffer. */ numerators_since_start = ++buffers_since_start * numerator; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_vid_cap; mutex_unlock(&dev->mutex); /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. */ next_jiffies_since_start = numerators_since_start * HZ + denominator / 2; do_div(next_jiffies_since_start, denominator); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && !kthread_should_stop()) schedule(); } dprintk(dev, 1, "Video Capture Thread End\n"); return 0; } static void vivid_grab_controls(struct vivid_dev *dev, bool grab) { v4l2_ctrl_grab(dev->ctrl_has_crop_cap, grab); v4l2_ctrl_grab(dev->ctrl_has_compose_cap, grab); v4l2_ctrl_grab(dev->ctrl_has_scaler_cap, grab); } int vivid_start_generating_vid_cap(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_cap) { u32 seq_count = dev->cap_seq_count + dev->seq_wrap * 128; if (pstreaming == &dev->vid_cap_streaming) dev->vid_cap_seq_start = seq_count; else if (pstreaming == &dev->vbi_cap_streaming) dev->vbi_cap_seq_start = seq_count; else dev->meta_cap_seq_start = seq_count; *pstreaming = true; return 0; } /* Resets frame counters */ tpg_init_mv_count(&dev->tpg); dev->vid_cap_seq_start = dev->seq_wrap * 128; dev->vbi_cap_seq_start = dev->seq_wrap * 128; dev->meta_cap_seq_start = dev->seq_wrap * 128; dev->kthread_vid_cap = kthread_run(vivid_thread_vid_cap, dev, "%s-vid-cap", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_vid_cap)) { int err = PTR_ERR(dev->kthread_vid_cap); dev->kthread_vid_cap = NULL; v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); return err; } *pstreaming = true; vivid_grab_controls(dev, true); dprintk(dev, 1, "returning from %s\n", __func__); return 0; } void vivid_stop_generating_vid_cap(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_cap == NULL) return; *pstreaming = false; if (pstreaming == &dev->vid_cap_streaming) { /* Release all active buffers */ while (!list_empty(&dev->vid_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vid_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vid_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->vbi_cap_streaming) { while (!list_empty(&dev->vbi_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vbi_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vbi_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->meta_cap_streaming) { while (!list_empty(&dev->meta_cap_active)) { struct vivid_buffer *buf; buf = list_entry(dev->meta_cap_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_cap); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "meta_cap buffer %d done\n", buf->vb.vb2_buf.index); } } if (dev->vid_cap_streaming || dev->vbi_cap_streaming || dev->meta_cap_streaming) return; /* shutdown control thread */ vivid_grab_controls(dev, false); kthread_stop(dev->kthread_vid_cap); dev->kthread_vid_cap = NULL; } |
| 13 13 11 11 7 7 7 30 9 2 1 4 1 1 3 1 1 1 1 6 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ocfs2/ioctl.c * * Copyright (C) 2006 Herbert Poetzl * adapted from Remy Card's ext2/ioctl.c */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/blkdev.h> #include <linux/compat.h> #include <linux/fileattr.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "journal.h" #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" #include "refcounttree.h" #include "sysfile.h" #include "dir.h" #include "buffer_head_io.h" #include "suballoc.h" #include "move_extents.h" #define o2info_from_user(a, b) \ copy_from_user(&(a), (b), sizeof(a)) #define o2info_to_user(a, b) \ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* * This is just a best-effort to tell userspace that this request * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) { kreq->ir_flags |= OCFS2_INFO_FL_ERROR; (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); } static inline void o2info_set_request_filled(struct ocfs2_info_request *req) { req->ir_flags |= OCFS2_INFO_FL_FILLED; } static inline void o2info_clear_request_filled(struct ocfs2_info_request *req) { req->ir_flags &= ~OCFS2_INFO_FL_FILLED; } static inline int o2info_coherent(struct ocfs2_info_request *req) { return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; int status; status = ocfs2_inode_lock(inode, NULL, 0); if (status < 0) { mlog_errno(status); return status; } ocfs2_get_inode_flags(OCFS2_I(inode)); flags = OCFS2_I(inode)->ip_attr; ocfs2_inode_unlock(inode, 0); fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE); return status; } int ocfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; struct buffer_head *bh = NULL; unsigned oldflags; int status; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { mlog_errno(status); goto bail; } if (!S_ISDIR(inode->i_mode)) flags &= ~OCFS2_DIRSYNC_FL; oldflags = ocfs2_inode->ip_attr; flags = flags & OCFS2_FL_MODIFIABLE; flags |= oldflags & ~OCFS2_FL_MODIFIABLE; /* Check already done by VFS, but repeat with ocfs lock */ status = -EPERM; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) goto bail_unlock; handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto bail_unlock; } ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) mlog_errno(status); ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode, 1); bail: brelse(bh); return status; } static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; oif.if_ro_compat_features = osb->s_feature_ro_compat; o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) return -EFAULT; return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) return -EFAULT; return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct inode *inode_alloc, u64 blkno, struct ocfs2_info_freeinode *fi, u32 slot) { int status = 0, unlock = 0; struct buffer_head *bh = NULL; struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) inode_lock(inode_alloc); if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } dinode_alloc = (struct ocfs2_dinode *)bh->b_data; fi->ifi_stat[slot].lfi_total = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total); fi->ifi_stat[slot].lfi_free = le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) - le32_to_cpu(dinode_alloc->id1.bitmap1.i_used); bail: if (unlock) ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) inode_unlock(inode_alloc); brelse(bh); return status; } static int ocfs2_info_handle_freeinode(struct inode *inode, struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; char namebuf[40]; int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL); if (!oifi) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oifi, req)) { status = -EFAULT; goto out_free; } oifi->ifi_slotnum = osb->max_slots; for (i = 0; i < oifi->ifi_slotnum; i++) { if (o2info_coherent(&oifi->ifi_req)) { inode_alloc = ocfs2_get_system_file_inode(osb, type, i); if (!inode_alloc) { mlog(ML_ERROR, "unable to get alloc inode in " "slot %u\n", i); status = -EIO; goto bail; } } else { ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); iput(inode_alloc); inode_alloc = NULL; if (status < 0) goto bail; } o2info_set_request_filled(&oifi->ifi_req); if (o2info_to_user(*oifi, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); out_free: kfree(oifi); out_err: return status; } static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) index = OCFS2_INFO_MAX_HIST - 1; hist->fc_chunks[index]++; hist->fc_clusters[index] += chunksize; } static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, unsigned int chunksize) { if (chunksize > stats->ffs_max) stats->ffs_max = chunksize; if (chunksize < stats->ffs_min) stats->ffs_min = chunksize; stats->ffs_avg += chunksize; stats->ffs_free_chunks_real++; } static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, struct inode *gb_inode, struct ocfs2_dinode *gb_dinode, struct ocfs2_chain_rec *rec, struct ocfs2_info_freefrag *ffg, u32 chunks_in_group) { int status = 0, used; u64 blkno; struct buffer_head *bh = NULL; struct ocfs2_group_desc *bg = NULL; unsigned int max_bits, num_clusters; unsigned int offset = 0, cluster, chunk; unsigned int chunk_free, last_chunksize = 0; if (!le32_to_cpu(rec->c_free)) goto bail; do { if (!bg) blkno = le64_to_cpu(rec->c_blkno); else blkno = le64_to_cpu(bg->bg_next_group); if (bh) { brelse(bh); bh = NULL; } if (o2info_coherent(&ffg->iff_req)) status = ocfs2_read_group_descriptor(gb_inode, gb_dinode, blkno, &bh); else status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog(ML_ERROR, "Can't read the group descriptor # " "%llu from device.", (unsigned long long)blkno); status = -EIO; goto bail; } bg = (struct ocfs2_group_desc *)bh->b_data; if (!le16_to_cpu(bg->bg_free_bits_count)) continue; max_bits = le16_to_cpu(bg->bg_bits); offset = 0; for (chunk = 0; chunk < chunks_in_group; chunk++) { /* * last chunk may be not an entire one. */ if ((offset + ffg->iff_chunksize) > max_bits) num_clusters = max_bits - offset; else num_clusters = ffg->iff_chunksize; chunk_free = 0; for (cluster = 0; cluster < num_clusters; cluster++) { used = ocfs2_test_bit(offset, (unsigned long *)bg->bg_bitmap); /* * - chunk_free counts free clusters in #N chunk. * - last_chunksize records the size(in) clusters * for the last real free chunk being counted. */ if (!used) { last_chunksize++; chunk_free++; } if (used && last_chunksize) { ocfs2_info_update_ffg(ffg, last_chunksize); last_chunksize = 0; } offset++; } if (chunk_free == ffg->iff_chunksize) ffg->iff_ffs.ffs_free_chunks++; } /* * need to update the info for last free chunk. */ if (last_chunksize) ocfs2_info_update_ffg(ffg, last_chunksize); } while (le64_to_cpu(bg->bg_next_group)); bail: brelse(bh); return status; } static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct inode *gb_inode, u64 blkno, struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; struct buffer_head *bh = NULL; struct ocfs2_chain_list *cl = NULL; struct ocfs2_chain_rec *rec = NULL; struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } unlock = 1; } else { status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; } } gb_dinode = (struct ocfs2_dinode *)bh->b_data; cl = &(gb_dinode->id2.i_chain); /* * Chunksize(in) clusters from userspace should be * less than clusters in a group. */ if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) { status = -EINVAL; goto bail; } memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats)); ffg->iff_ffs.ffs_min = ~0U; ffg->iff_ffs.ffs_clusters = le32_to_cpu(gb_dinode->id1.bitmap1.i_total); ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters - le32_to_cpu(gb_dinode->id1.bitmap1.i_used); chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { rec = &(cl->cl_recs[i]); status = ocfs2_info_freefrag_scan_chain(osb, gb_inode, gb_dinode, rec, ffg, chunks_in_group); if (status) goto bail; } if (ffg->iff_ffs.ffs_free_chunks_real) ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg / ffg->iff_ffs.ffs_free_chunks_real); bail: if (unlock) ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) inode_unlock(gb_inode); iput(gb_inode); brelse(bh); return status; } static int ocfs2_info_handle_freefrag(struct inode *inode, struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *gb_inode = NULL; oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL); if (!oiff) { status = -ENOMEM; mlog_errno(status); goto out_err; } if (o2info_from_user(*oiff, req)) { status = -EFAULT; goto out_free; } /* * chunksize from userspace should be power of 2. */ if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) || (!oiff->iff_chunksize)) { status = -EINVAL; goto bail; } if (o2info_coherent(&oiff->iff_req)) { gb_inode = ocfs2_get_system_file_inode(osb, type, OCFS2_INVALID_SLOT); if (!gb_inode) { mlog(ML_ERROR, "unable to get global_bitmap inode\n"); status = -EIO; goto bail; } } else { ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (status < 0) { status = -ENOENT; goto bail; } } status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff); if (status < 0) goto bail; o2info_set_request_filled(&oiff->iff_req); if (o2info_to_user(*oiff, req)) { status = -EFAULT; goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); out_free: kfree(oiff); out_err: return status; } static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) return -EFAULT; return 0; } /* * Validate and distinguish OCFS2_IOC_INFO requests. * * - validate the magic number. * - distinguish different requests. * - validate size of different requests. */ static int ocfs2_info_handle_request(struct inode *inode, struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) goto bail; status = -EINVAL; if (oir.ir_magic != OCFS2_INFO_MAGIC) goto bail; switch (oir.ir_code) { case OCFS2_INFO_BLOCKSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) status = ocfs2_info_handle_blocksize(inode, req); break; case OCFS2_INFO_CLUSTERSIZE: if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) status = ocfs2_info_handle_clustersize(inode, req); break; case OCFS2_INFO_MAXSLOTS: if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) status = ocfs2_info_handle_maxslots(inode, req); break; case OCFS2_INFO_LABEL: if (oir.ir_size == sizeof(struct ocfs2_info_label)) status = ocfs2_info_handle_label(inode, req); break; case OCFS2_INFO_UUID: if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) status = ocfs2_info_handle_uuid(inode, req); break; case OCFS2_INFO_FS_FEATURES: if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) status = ocfs2_info_handle_fs_features(inode, req); break; case OCFS2_INFO_JOURNAL_SIZE: if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) status = ocfs2_info_handle_journal_size(inode, req); break; case OCFS2_INFO_FREEINODE: if (oir.ir_size == sizeof(struct ocfs2_info_freeinode)) status = ocfs2_info_handle_freeinode(inode, req); break; case OCFS2_INFO_FREEFRAG: if (oir.ir_size == sizeof(struct ocfs2_info_freefrag)) status = ocfs2_info_handle_freefrag(inode, req); break; default: status = ocfs2_info_handle_unknown(inode, req); break; } bail: return status; } static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; if (compat_flag) { #ifdef CONFIG_COMPAT /* * pointer bp stores the base address of a pointers array, * which collects all addresses of separate request. */ bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); #else BUG(); #endif } else bp = (u64 __user *)(unsigned long)(info->oi_requests); if (o2info_from_user(*req_addr, bp + idx)) goto bail; status = 0; bail: return status; } /* * OCFS2_IOC_INFO handles an array of requests passed from userspace. * * ocfs2_info_handle() recevies a large info aggregation, grab and * validate the request count from header, then break it into small * pieces, later specific handlers can handle them one by one. * * Idea here is to make each separate request small enough to ensure * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ static noinline_for_stack int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; struct ocfs2_info_request __user *reqp; if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || (!info->oi_requests)) { status = -EINVAL; goto bail; } for (i = 0; i < info->oi_count; i++) { status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); if (status) break; reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr; if (!reqp) { status = -EINVAL; goto bail; } status = ocfs2_info_handle_request(inode, reqp); if (status) break; } bail: return status; } long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); void __user *argp = (void __user *)arg; int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: { struct ocfs2_space_resv sr; if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); } case OCFS2_IOC_GROUP_EXTEND: { int new_clusters; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: { struct ocfs2_new_group_input input; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; status = mnt_want_write_file(filp); if (status) return status; status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; } case OCFS2_IOC_REFLINK: { struct reflink_arguments args; const char __user *old_path; const char __user *new_path; bool preserve; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; new_path = (const char __user *)(unsigned long)args.new_path; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); } case OCFS2_IOC_INFO: { struct ocfs2_info info; if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); } case FITRIM: { struct super_block *sb = inode->i_sb; struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user(argp, &range, sizeof(range))) return -EFAULT; return 0; } case OCFS2_IOC_MOVE_EXT: return ocfs2_ioctl_move_extents(filp, argp); default: return -ENOTTY; } } #ifdef CONFIG_COMPAT long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) { bool preserve; struct reflink_arguments args; struct inode *inode = file_inode(file); struct ocfs2_info info; void __user *argp = (void __user *)arg; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); case OCFS2_IOC_INFO: if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: return -ENOIOCTLCMD; } return ocfs2_ioctl(file, cmd, arg); } #endif |
| 109 109 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/util.c */ #include <linux/time.h> #include "isofs.h" /* * We have to convert from a MM/DD/YY format to the Unix ctime format. * We have to take into account leap years and all of that good stuff. * Unfortunately, the kernel does not have the information on hand to * take into account daylight savings time, but it shouldn't matter. * The time stored should be localtime (with or without DST in effect), * and the timezone offset should hold the offset required to get back * to GMT. Thus we should always be correct. */ int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; year = p[0]; month = p[1]; day = p[2]; hour = p[3]; minute = p[4]; second = p[5]; if (flag == 0) tz = p[6]; /* High sierra has no time zone */ else tz = 0; if (year < 0) { crtime = 0; } else { crtime = mktime64(year+1900, month, day, hour, minute, second); /* sign extend */ if (tz & 0x80) tz |= (-1 << 8); /* * The timezone offset is unreliable on some disks, * so we make a sanity check. In no case is it ever * more than 13 hours from GMT, which is 52*15min. * The time is always stored in localtime with the * timezone offset being what get added to GMT to * get to localtime. Thus we need to subtract the offset * to get to true GMT, which is what we store the time * as internally. On the local system, the user may set * their timezone any way they wish, of course, so GMT * gets converted back to localtime on the receiving * system. * * NOTE: mkisofs in versions prior to mkisofs-1.10 had * the sign wrong on the timezone offset. This has now * been corrected there too, but if you are getting screwy * results this may be the explanation. If enough people * complain, a user configuration option could be added * to add the timezone offset in with the wrong sign * for 'compatibility' with older discs, but I cannot see how * it will matter that much. * * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) * for pointing out the sign error. */ if (-52 <= tz && tz <= 52) crtime -= tz * 15 * 60; } return crtime; } |
| 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/affs/amigaffs.c * * (c) 1996 Hans-Joachim Widmaier - Rewritten * * (C) 1993 Ray Burr - Amiga FFS filesystem. * * Please send bug reports to: hjw@zvw.de */ #include <linux/math64.h> #include <linux/iversion.h> #include "affs.h" /* * Functions for accessing Amiga-FFS structures. */ /* Insert a header block bh into the directory dir * caller must hold AFFS_DIR->i_hash_lock! */ int affs_insert_hash(struct inode *dir, struct buffer_head *bh) { struct super_block *sb = dir->i_sb; struct buffer_head *dir_bh; u32 ino, hash_ino; int offset; ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) return -EIO; hash_ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[offset]); while (hash_ino) { affs_brelse(dir_bh); dir_bh = affs_bread(sb, hash_ino); if (!dir_bh) return -EIO; hash_ino = be32_to_cpu(AFFS_TAIL(sb, dir_bh)->hash_chain); } AFFS_TAIL(sb, bh)->parent = cpu_to_be32(dir->i_ino); AFFS_TAIL(sb, bh)->hash_chain = 0; affs_fix_checksum(sb, bh); if (dir->i_ino == dir_bh->b_blocknr) AFFS_HEAD(dir_bh)->table[offset] = cpu_to_be32(ino); else AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino); affs_adjust_checksum(dir_bh, ino); mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); return 0; } /* Remove a header block from its directory. * caller must hold AFFS_DIR->i_hash_lock! */ int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) { struct super_block *sb; struct buffer_head *bh; u32 rem_ino, hash_ino; __be32 ino; int offset, retval; sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, rem_ino, offset); bh = affs_bread(sb, dir->i_ino); if (!bh) return -EIO; retval = -ENOENT; hash_ino = be32_to_cpu(AFFS_HEAD(bh)->table[offset]); while (hash_ino) { if (hash_ino == rem_ino) { ino = AFFS_TAIL(sb, rem_bh)->hash_chain; if (dir->i_ino == bh->b_blocknr) AFFS_HEAD(bh)->table[offset] = ino; else AFFS_TAIL(sb, bh)->hash_chain = ino; affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino); mark_buffer_dirty_inode(bh, dir); AFFS_TAIL(sb, rem_bh)->parent = 0; retval = 0; break; } affs_brelse(bh); bh = affs_bread(sb, hash_ino); if (!bh) return -EIO; hash_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); } affs_brelse(bh); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); return retval; } static void affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; } } spin_unlock(&inode->i_lock); } /* Remove header from link chain */ static int affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) goto done; link_ino = (u32)(long)dentry->d_fsdata; if (inode->i_ino == link_ino) { /* we can't remove the head of the link, as its blocknr is still used as ino, * so we remove the block of the first link instead. */ link_ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain); link_bh = affs_bread(sb, link_ino); if (!link_bh) goto done; dir = affs_iget(sb, be32_to_cpu(AFFS_TAIL(sb, link_bh)->parent)); if (IS_ERR(dir)) { retval = PTR_ERR(dir); goto done; } affs_lock_dir(dir); /* * if there's a dentry for that block, make it * refer to inode itself. */ affs_fix_dcache(inode, link_ino); retval = affs_remove_hash(dir, link_bh); if (retval) { affs_unlock_dir(dir); goto done; } mark_buffer_dirty_inode(link_bh, inode); memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32); retval = affs_insert_hash(dir, bh); if (retval) { affs_unlock_dir(dir); goto done; } mark_buffer_dirty_inode(bh, inode); affs_unlock_dir(dir); iput(dir); } else { link_bh = affs_bread(sb, link_ino); if (!link_bh) goto done; } while ((ino = be32_to_cpu(AFFS_TAIL(sb, bh)->link_chain)) != 0) { if (ino == link_ino) { __be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain; AFFS_TAIL(sb, bh)->link_chain = ino2; affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino); mark_buffer_dirty_inode(bh, inode); retval = 0; /* Fix the link count, if bh is a normal header block without links */ switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { case ST_LINKDIR: case ST_LINKFILE: break; default: if (!AFFS_TAIL(sb, bh)->link_chain) set_nlink(inode, 1); } affs_free_block(sb, link_ino); goto done; } affs_brelse(bh); bh = affs_bread(sb, ino); if (!bh) goto done; } retval = -ENOENT; done: affs_brelse(link_bh); affs_brelse(bh); return retval; } static int affs_empty_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; int retval, size; retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) goto done; retval = -ENOTEMPTY; for (size = AFFS_SB(sb)->s_hashsize - 1; size >= 0; size--) if (AFFS_HEAD(bh)->table[size]) goto not_empty; retval = 0; not_empty: affs_brelse(bh); done: return retval; } /* Remove a filesystem object. If the object to be removed has * links to it, one of the links must be changed to inherit * the file or directory. As above, any inode will do. * The buffer will not be freed. If the header is a link, the * block will be marked as free. * This function returns a negative error number in case of * an error, else 0 if the inode is to be deleted or 1 if not. */ int affs_remove_header(struct dentry *dentry) { struct super_block *sb; struct inode *inode, *dir; struct buffer_head *bh = NULL; int retval; dir = d_inode(dentry->d_parent); sb = dir->i_sb; retval = -ENOENT; inode = d_inode(dentry); if (!inode) goto done; pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); if (!bh) goto done; affs_lock_link(inode); affs_lock_dir(dir); switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) { case ST_USERDIR: /* if we ever want to support links to dirs * i_hash_lock of the inode must only be * taken after some checks */ affs_lock_dir(inode); retval = affs_empty_dir(inode); affs_unlock_dir(inode); if (retval) goto done_unlock; break; default: break; } retval = affs_remove_hash(dir, bh); if (retval) goto done_unlock; mark_buffer_dirty_inode(bh, inode); affs_unlock_dir(dir); if (inode->i_nlink > 1) retval = affs_remove_link(dentry); else clear_nlink(inode); affs_unlock_link(inode); inode_set_ctime_current(inode); mark_inode_dirty(inode); done: affs_brelse(bh); return retval; done_unlock: affs_unlock_dir(dir); affs_unlock_link(inode); goto done; } /* Checksum a block, do various consistency checks and optionally return the blocks type number. DATA points to the block. If their pointers are non-null, *PTYPE and *STYPE are set to the primary and secondary block types respectively, *HASHSIZE is set to the size of the hashtable (which lets us calculate the block size). Returns non-zero if the block is not consistent. */ u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh) { __be32 *ptr = (__be32 *)bh->b_data; u32 sum; int bsize; sum = 0; for (bsize = sb->s_blocksize / sizeof(__be32); bsize > 0; bsize--) sum += be32_to_cpu(*ptr++); return sum; } /* * Calculate the checksum of a disk block and store it * at the indicated position. */ void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) { int cnt = sb->s_blocksize / sizeof(__be32); __be32 *ptr = (__be32 *)bh->b_data; u32 checksum; __be32 *checksumptr; checksumptr = ptr + 5; *checksumptr = 0; for (checksum = 0; cnt > 0; ptr++, cnt--) checksum += be32_to_cpu(*ptr); *checksumptr = cpu_to_be32(-checksum); } void affs_secs_to_datestamp(time64_t secs, struct affs_date *ds) { u32 days; u32 minute; s32 rem; secs -= sys_tz.tz_minuteswest * 60 + AFFS_EPOCH_DELTA; if (secs < 0) secs = 0; days = div_s64_rem(secs, 86400, &rem); minute = rem / 60; rem -= minute * 60; ds->days = cpu_to_be32(days); ds->mins = cpu_to_be32(minute); ds->ticks = cpu_to_be32(rem * 50); } umode_t affs_prot_to_mode(u32 prot) { umode_t mode = 0; if (!(prot & FIBF_NOWRITE)) mode |= 0200; if (!(prot & FIBF_NOREAD)) mode |= 0400; if (!(prot & FIBF_NOEXECUTE)) mode |= 0100; if (prot & FIBF_GRP_WRITE) mode |= 0020; if (prot & FIBF_GRP_READ) mode |= 0040; if (prot & FIBF_GRP_EXECUTE) mode |= 0010; if (prot & FIBF_OTR_WRITE) mode |= 0002; if (prot & FIBF_OTR_READ) mode |= 0004; if (prot & FIBF_OTR_EXECUTE) mode |= 0001; return mode; } void affs_mode_to_prot(struct inode *inode) { u32 prot = AFFS_I(inode)->i_protect; umode_t mode = inode->i_mode; /* * First, clear all RWED bits for owner, group, other. * Then, recalculate them afresh. * * We'll always clear the delete-inhibit bit for the owner, as that is * the classic single-user mode AmigaOS protection bit and we need to * stay compatible with all scenarios. * * Since multi-user AmigaOS is an extension, we'll only set the * delete-allow bit if any of the other bits in the same user class * (group/other) are used. */ prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD | FIBF_NOWRITE | FIBF_NODELETE | FIBF_GRP_EXECUTE | FIBF_GRP_READ | FIBF_GRP_WRITE | FIBF_GRP_DELETE | FIBF_OTR_EXECUTE | FIBF_OTR_READ | FIBF_OTR_WRITE | FIBF_OTR_DELETE); /* Classic single-user AmigaOS flags. These are inverted. */ if (!(mode & 0100)) prot |= FIBF_NOEXECUTE; if (!(mode & 0400)) prot |= FIBF_NOREAD; if (!(mode & 0200)) prot |= FIBF_NOWRITE; /* Multi-user extended flags. Not inverted. */ if (mode & 0010) prot |= FIBF_GRP_EXECUTE; if (mode & 0040) prot |= FIBF_GRP_READ; if (mode & 0020) prot |= FIBF_GRP_WRITE; if (mode & 0070) prot |= FIBF_GRP_DELETE; if (mode & 0001) prot |= FIBF_OTR_EXECUTE; if (mode & 0004) prot |= FIBF_OTR_READ; if (mode & 0002) prot |= FIBF_OTR_WRITE; if (mode & 0007) prot |= FIBF_OTR_DELETE; AFFS_I(inode)->i_protect = prot; } void affs_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; va_end(args); } void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); va_end(args); } bool affs_nofilenametruncate(const struct dentry *dentry) { return affs_test_opt(AFFS_SB(dentry->d_sb)->s_flags, SF_NO_TRUNCATE); } /* Check if the name is valid for a affs object. */ int affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; if (len > AFFSNAMEMAX) { if (notruncate) return -ENAMETOOLONG; len = AFFSNAMEMAX; } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) return -EINVAL; } return 0; } /* This function copies name to bstr, with at most 30 * characters length. The bstr will be prepended by * a length byte. * NOTE: The name will must be already checked by * affs_check_name()! */ int affs_copy_name(unsigned char *bstr, struct dentry *dentry) { u32 len = min(dentry->d_name.len, AFFSNAMEMAX); *bstr++ = len; memcpy(bstr, dentry->d_name.name, len); return len; } |
| 79 79 79 79 79 7 25 72 18 72 38 38 38 38 38 38 6 6 6 2 6 45 8 38 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic helpers functions */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/scatterlist.h> #include "blk.h" static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) { unsigned int discard_granularity = bdev_discard_granularity(bdev); sector_t granularity_aligned_sector; if (bdev_is_partition(bdev)) sector += bdev->bd_start_sect; granularity_aligned_sector = round_up(sector, discard_granularity >> SECTOR_SHIFT); /* * Make sure subsequent bios start aligned to the discard granularity if * it needs to be split. */ if (granularity_aligned_sector != sector) return granularity_aligned_sector - sector; /* * Align the bio size to the discard granularity to make splitting the bio * at discard granularity boundaries easier in the driver if needed. */ return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } struct bio *blk_alloc_discard_bio(struct block_device *bdev, sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) { sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); struct bio *bio; if (!bio_sects) return NULL; bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); if (!bio) return NULL; bio->bi_iter.bi_sector = *sector; bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; *sector += bio_sects; *nr_sects -= bio_sects; /* * We can loop for a long time in here if someone does full device * discards (like mkfs). Be nice and allow us to schedule out to avoid * softlocking if preempt is disabled. */ cond_resched(); return bio; } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { struct bio *bio; while ((bio = blk_alloc_discard_bio(bdev, §or, &nr_sects, gfp_mask))) *biop = bio_chain_and_submit(*biop, bio); return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); /** * blkdev_issue_discard - queue a discard * @bdev: blockdev to issue discard for * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) ret = 0; bio_put(bio); } blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_discard); static sector_t bio_write_zeroes_limit(struct block_device *bdev) { sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; return min(bdev_write_zeroes_sectors(bdev), (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); } /* * There is no reliable way for the SCSI subsystem to determine whether a * device supports a WRITE SAME operation without actually performing a write * to media. As a result, write_zeroes is enabled by default and will be * disabled if a zeroing operation subsequently fails. This means that this * queue limit is likely to change at runtime. */ static void __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags, sector_t limit) { while (nr_sects) { unsigned int len = min(nr_sects, limit); struct bio *bio; if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) break; bio = bio_alloc(bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; bio->bi_iter.bi_size = len << SECTOR_SHIFT; *biop = bio_chain_and_submit(*biop, bio); nr_sects -= len; sector += len; cond_resched(); } } static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp, unsigned flags) { sector_t limit = bio_write_zeroes_limit(bdev); struct bio *bio = NULL; struct blk_plug plug; int ret = 0; blk_start_plug(&plug); __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags, limit); if (bio) { if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { bio_await_chain(bio); blk_finish_plug(&plug); return -EINTR; } ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); /* * For some devices there is no non-destructive way to verify whether * WRITE ZEROES is actually supported. These will clear the capability * on an I/O error, in which case we'll turn any error into * "not supported" here. */ if (ret && !bdev_write_zeroes_sectors(bdev)) return -EOPNOTSUPP; return ret; } /* * Convert a number of 512B sectors to a number of pages. * The result is limited to a number of pages that can fit into a BIO. * Also make sure that the result is always at least 1 (page) for the cases * where nr_sects is lower than the number of sectors in a page. */ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); return min(pages, (sector_t)BIO_MAX_VECS); } static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned int flags) { while (nr_sects) { unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); struct bio *bio; bio = bio_alloc(bdev, nr_vecs, REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) break; do { unsigned int len, added; len = min_t(sector_t, PAGE_SIZE, nr_sects << SECTOR_SHIFT); added = bio_add_page(bio, ZERO_PAGE(0), len, 0); if (added < len) break; nr_sects -= added >> SECTOR_SHIFT; sector += added >> SECTOR_SHIFT; } while (nr_sects); *biop = bio_chain_and_submit(*biop, bio); cond_resched(); } } static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp, unsigned flags) { struct bio *bio = NULL; struct blk_plug plug; int ret = 0; if (flags & BLKDEV_ZERO_NOFALLBACK) return -EOPNOTSUPP; blk_start_plug(&plug); __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); if (bio) { if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { bio_await_chain(bio); blk_finish_plug(&plug); return -EINTR; } ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); return ret; } /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @biop: pointer to anchor bio * @flags: controls detailed behavior * * Description: * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. * * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided. */ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { sector_t limit = bio_write_zeroes_limit(bdev); if (bdev_read_only(bdev)) return -EPERM; if (limit) { __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags, limit); } else { if (flags & BLKDEV_ZERO_NOFALLBACK) return -EOPNOTSUPP; __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, biop, flags); } return 0; } EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range * @bdev: blockdev to write * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @flags: controls detailed behavior * * Description: * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. See __blkdev_issue_zeroout() for the * valid values for %flags. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { int ret; if ((sector | nr_sects) & ((bdev_logical_block_size(bdev) >> 9) - 1)) return -EINVAL; if (bdev_read_only(bdev)) return -EPERM; if (bdev_write_zeroes_sectors(bdev)) { ret = blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, flags); if (ret != -EOPNOTSUPP) return ret; } return blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, flags); } EXPORT_SYMBOL(blkdev_issue_zeroout); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp) { sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev); struct bio *bio = NULL; struct blk_plug plug; int ret = 0; /* make sure that "len << SECTOR_SHIFT" doesn't overflow */ if (max_sectors > UINT_MAX >> SECTOR_SHIFT) max_sectors = UINT_MAX >> SECTOR_SHIFT; max_sectors &= ~bs_mask; if (max_sectors == 0) return -EOPNOTSUPP; if ((sector | nr_sects) & bs_mask) return -EINVAL; if (bdev_read_only(bdev)) return -EPERM; blk_start_plug(&plug); while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, max_sectors); bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = len << SECTOR_SHIFT; sector += len; nr_sects -= len; cond_resched(); } if (bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_secure_erase); |
| 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 | // SPDX-License-Identifier: GPL-2.0-only /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/bio-integrity.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/rw_hint.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include <scsi/scsi_common.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode); static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); static mempool_t *sd_page_pool; static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_set_flush_flag(struct scsi_disk *sdkp, struct queue_limits *lim) { if (sdkp->WCE) { lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) lim->features |= BLK_FEAT_FUA; else lim->features &= ~BLK_FEAT_FUA; } else { lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len, ret; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { struct queue_limits lim; sdkp->WCE = wce; sdkp->RCD = rcd; lim = queue_limits_start_update(sdkp->disk->queue); sd_set_flush_flag(sdkp, &lim); blk_mq_freeze_queue(sdkp->disk->queue); ret = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (ret) return ret; return count; } if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT, sdkp->max_retries, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } sd_revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop && sdp->manage_runtime_start_stop && sdp->manage_shutdown); } static DEVICE_ATTR_RO(manage_start_stop); static ssize_t manage_system_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop); } static ssize_t manage_system_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_system_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_system_start_stop); static ssize_t manage_runtime_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop); } static ssize_t manage_runtime_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_runtime_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_runtime_start_stop); static ssize_t manage_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_shutdown); } static ssize_t manage_shutdown_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_shutdown = v; return count; } static DEVICE_ATTR_RW(manage_shutdown); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; lim = queue_limits_start_update(sdkp->disk->queue); sd_config_discard(sdkp, &lim, mode); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } lim = queue_limits_start_update(sdkp->disk->queue); sd_config_write_same(sdkp, &lim); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static ssize_t zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); if (sdkp->device->type == TYPE_ZBC) return sprintf(buf, "host-managed\n"); if (sdkp->zoned == 1) return sprintf(buf, "host-aware\n"); if (sdkp->zoned == 2) return sprintf(buf, "drive-managed\n"); return sprintf(buf, "none\n"); } static DEVICE_ATTR_RO(zoned_cap); static ssize_t max_retries_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdev = sdkp->device; int retries, err; err = kstrtoint(buf, 10, &retries); if (err) return err; if (retries == SCSI_CMD_RETRIES_NO_LIMIT || retries <= SD_MAX_RETRIES) { sdkp->max_retries = retries; return count; } sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n", SD_MAX_RETRIES); return -EINVAL; } static ssize_t max_retries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%d\n", sdkp->max_retries); } static DEVICE_ATTR_RW(max_retries); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_manage_system_start_stop.attr, &dev_attr_manage_runtime_start_stop.attr, &dev_attr_manage_shutdown.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, &dev_attr_zoned_cap.attr, &dev_attr_max_retries.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static struct class sd_disk_class = { .name = "scsi_disk", .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; /* * Don't request a new module, as that could deadlock in multipath * environment. */ static void sd_default_probe(dev_t devt) { } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_disk *sdkp = data; struct scsi_device *sdev = sdkp->device; u8 cdb[12] = { 0, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, len, SD_TIMEOUT, sdkp->max_retries, &exec_args); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ /* * Look up the DIX operation based on whether the command is read or * write and whether dix and dif are enabled. */ static unsigned int sd_prot_op(bool write, bool dix, bool dif) { /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ static const unsigned int ops[] = { /* wrt dix dif */ SCSI_PROT_NORMAL, /* 0 0 0 */ SCSI_PROT_READ_STRIP, /* 0 0 1 */ SCSI_PROT_READ_INSERT, /* 0 1 0 */ SCSI_PROT_READ_PASS, /* 0 1 1 */ SCSI_PROT_NORMAL, /* 1 0 0 */ SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ SCSI_PROT_WRITE_PASS, /* 1 1 1 */ }; return ops[write << 2 | dix << 1 | dif]; } /* * Returns a mask of the protection flags that are valid for a given DIX * operation. */ static unsigned int sd_prot_flag_mask(unsigned int prot_op) { static const unsigned int flag_mask[] = { [SCSI_PROT_NORMAL] = 0, [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, }; return flag_mask[prot_op]; } static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct request *rq = scsi_cmd_to_rq(scmd); struct bio *bio = rq->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode) { unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; lim->discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } lim->max_hw_discard_sectors = max_blocks * (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) { struct page *page; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; clear_highpage(page); bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; return bvec_virt(&rq->special_vec); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; char *buf; buf = sd_set_special_bvec(rq, data_len); if (!buf) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); put_unaligned_be32(nr_blocks, &buf[16]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; return scsi_alloc_sgtables(cmd); } static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size, physical_block_size_sectors, max_atomic, unit_min, unit_max; if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || sdkp->protection_type == T10_PI_TYPE2_PROTECTION) return; physical_block_size_sectors = sdkp->physical_block_size / sdkp->device->sector_size; unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? sdkp->atomic_granularity : physical_block_size_sectors); /* * Only use atomic boundary when we have the odd scenario of * sdkp->max_atomic == 0, which the spec does permit. */ if (sdkp->max_atomic) { max_atomic = sdkp->max_atomic; unit_max = rounddown_pow_of_two(sdkp->max_atomic); sdkp->use_atomic_write_boundary = 0; } else { max_atomic = sdkp->max_atomic_with_boundary; unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); sdkp->use_atomic_write_boundary = 1; } /* * Ensure compliance with granularity and alignment. For now, keep it * simple and just don't support atomic writes for values mismatched * with max_{boundary}atomic, physical block size, and * atomic_granularity itself. * * We're really being distrustful by checking unit_max also... */ if (sdkp->atomic_granularity > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_granularity) return; if (unit_max > 1 && unit_max % sdkp->atomic_granularity) return; } if (sdkp->atomic_alignment > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_alignment) return; if (unit_max > 1 && unit_max % sdkp->atomic_alignment) return; } lim->atomic_write_hw_max = max_atomic * logical_block_size; lim->atomic_write_hw_boundary = 0; lim->atomic_write_hw_unit_min = unit_min * logical_block_size; lim->atomic_write_hw_unit_max = unit_max * logical_block_size; } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: return sd_setup_write_same16_cmnd(cmd, true); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } } if (sdp->no_write_same) { rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); return sd_setup_write_same10_cmnd(cmd, false); } static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* * Reporting a maximum number of blocks that is not aligned * on the device physical size would cause a large write same * request to be split into physically unaligned chunks by * __blkdev_issue_write_zeroes() even if the caller of this * functions took care to align the large request. So make sure * the maximum reported is aligned to the device physical block * size. This is only an optional optimization for regular * disks, but this is mandatory to avoid failure of large write * same requests directed at sequential write required zones of * host-managed ZBC disks. */ sdkp->max_ws_blocks = round_down(sdkp->max_ws_blocks, bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } out: lim->max_write_zeroes_sectors = sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); if (cmd->device->use_16_for_sync) { cmd->cmnd[0] = SYNCHRONIZE_CACHE_16; cmd->cmd_len = 16; } else { cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; } cmd->transfersize = 0; cmd->allowed = sdkp->max_retries; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLK_STS_OK; } /** * sd_group_number() - Compute the GROUP NUMBER field * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER * field. * * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf): * 0: no relative lifetime. * 1: shortest relative lifetime. * 2: second shortest relative lifetime. * 3 - 0x3d: intermediate relative lifetimes. * 0x3e: second longest relative lifetime. * 0x3f: longest relative lifetime. */ static u8 sd_group_number(struct scsi_cmnd *cmd) { const struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); if (!sdkp->rscs) return 0; return min3((u32)rq->bio->bi_write_hint, (u32)sdkp->permanent_stream_count, 0x3fu); } static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; cmd->cmnd[11] = dld & 0x07; put_unaligned_be64(lba, &cmd->cmnd[12]); put_unaligned_be32(lba, &cmd->cmnd[20]); /* Expected Indirect LBA */ put_unaligned_be32(nr_blocks, &cmd->cmnd[28]); return BLK_STS_OK; } static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); cmd->cmnd[14] = ((dld & 0x03) << 6) | sd_group_number(cmd); cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); return BLK_STS_OK; } static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); return BLK_STS_OK; } static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { /* Avoid that 0 blocks gets translated into 256 blocks. */ if (WARN_ON_ONCE(nr_blocks == 0)) return BLK_STS_IOERR; if (unlikely(flags & 0x8)) { /* * This happens only if this drive failed 10byte rw * command with ILLEGAL_REQUEST during operation and * thus turned off use_10_for_rw. */ scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n"); return BLK_STS_IOERR; } cmd->cmd_len = 6; cmd->cmnd[0] = write ? WRITE_6 : READ_6; cmd->cmnd[1] = (lba >> 16) & 0x1f; cmd->cmnd[2] = (lba >> 8) & 0xff; cmd->cmnd[3] = lba & 0xff; cmd->cmnd[4] = nr_blocks; cmd->cmnd[5] = 0; return BLK_STS_OK; } /* * Check if a command has a duration limit set. If it does, and the target * device supports CDL and the feature is enabled, return the limit * descriptor index to use. Return 0 (no limit) otherwise. */ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) { struct scsi_device *sdp = sdkp->device; int hint; if (!sdp->cdl_supported || !sdp->cdl_enable) return 0; /* * Use "no limit" if the request ioprio does not specify a duration * limit hint. */ hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd))); if (hint < IOPRIO_HINT_DEV_DURATION_LIMIT_1 || hint > IOPRIO_HINT_DEV_DURATION_LIMIT_7) return 0; return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, sector_t lba, unsigned int nr_blocks, bool boundary, unsigned char flags) { cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_ATOMIC_16; cmd->cmnd[1] = flags; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); if (boundary) put_unaligned_be16(nr_blocks, &cmd->cmnd[10]); else put_unaligned_be16(0, &cmd->cmnd[10]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); cmd->cmnd[14] = 0; cmd->cmnd[15] = 0; return BLK_STS_OK; } static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int mask = logical_to_sectors(sdp, 1) - 1; bool write = rq_data_dir(rq) == WRITE; unsigned char protect, fua; unsigned int dld; blk_status_t ret; unsigned int dif; bool dix; ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); goto fail; } /* * Some SD card readers can't handle accesses which touch the * last one or two logical blocks. Split accesses as needed. */ threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS; if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) { if (lba < threshold) { /* Access up to the threshold but not beyond */ nr_blocks = threshold - lba; } else { /* Access only a single logical block */ nr_blocks = 1; } } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); dld = sd_cdl_dld(sdkp, cmd); if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if (rq->cmd_flags & REQ_ATOMIC) { ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, sdkp->use_atomic_write_boundary, protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks, protect | fua); } if (unlikely(ret != BLK_STS_OK)) goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, cmd, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, cmd, "%s %d/%u 512 byte blocks.\n", write ? "writing" : "reading", nr_blocks, blk_rq_sectors(rq))); /* * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; fail: scsi_free_sgtables(cmd); return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, false); case REQ_OP_ZONE_RESET_ALL: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, true); case REQ_OP_ZONE_OPEN: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false); case REQ_OP_ZONE_CLOSE: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false); case REQ_OP_ZONE_FINISH: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = scsi_cmd_to_rq(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); } static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { if (disk_check_media_change(disk)) return true; } /* * Force a full rescan after ioctl(BLKRRPART). While the disk state has * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** * sd_open - open a scsi disk device * @disk: disk to open * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with disk->open_mutex held. **/ static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; if (scsi_device_get(sdev)) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sd_need_revalidate(disk, sdkp)) sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & BLK_OPEN_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_device_put(sdev); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_device_put(sdev); } static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, bdev, capacity, diskinfo); else scsicam_bios_param(bdev, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, " "cmd=0x%x\n", disk->disk_name, cmd)); if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO)) return -ENOIOCTLCMD; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = disk->private_data; struct scsi_device *sdp; int retval; bool disk_changed; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries, &sshdr); /* failed to execute TUR, assume media not present */ if (retval < 0 || host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ disk_changed = sdp->changed; sdp->changed = 0; return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } static int sd_sync_cache(struct scsi_disk *sdkp) { int res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; /* Leave the rest of the command zero to indicate flush everything. */ const unsigned char cmd[16] = { sdp->use_16_for_sync ? SYNCHRONIZE_CACHE_16 : SYNCHRONIZE_CACHE }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .allowed = 3, .result = SCMD_FAILURE_RESULT_ANY, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, .sshdr = &sshdr, .failures = &failures, }; if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (res < 0) return res; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ if (sshdr.asc == 0x3a || /* medium not present */ sshdr.asc == 0x20 || /* invalid command */ (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; /* * If a format is in progress or if the drive does not * support sync, there is not much we can do because * this is called during shutdown or suspend so just * return success so those operations can proceed. */ if ((sshdr.asc == 0x04 && sshdr.ascq == 0x04) || sshdr.sense_key == ILLEGAL_REQUEST) return 0; } switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_revalidate_disk(sdkp->disk); } static int sd_get_unique_id(struct gendisk *disk, u8 id[16], enum blk_unique_id type) { struct scsi_device *sdev = scsi_disk(disk)->device; const struct scsi_vpd *vpd; const unsigned char *d; int ret = -ENXIO, len; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg83); if (!vpd) goto out_unlock; ret = -EINVAL; for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { /* we only care about designators with LU association */ if (((d[1] >> 4) & 0x3) != 0x00) continue; if ((d[1] & 0xf) != type) continue; /* * Only exit early if a 16-byte descriptor was found. Otherwise * keep looking as one with more entropy might still show up. */ len = d[3]; if (len != 8 && len != 12 && len != 16) continue; ret = len; memcpy(id, d + 4, len); if (len == 16) break; } out_unlock: rcu_read_unlock(); return ret; } static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result) { switch (host_byte(result)) { case DID_TRANSPORT_MARGINAL: case DID_TRANSPORT_DISRUPTED: case DID_BUS_BUSY: return PR_STS_RETRY_PATH_FAILURE; case DID_NO_CONNECT: return PR_STS_PATH_FAILED; case DID_TRANSPORT_FAILFAST: return PR_STS_PATH_FAST_FAILED; } switch (status_byte(result)) { case SAM_STAT_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; case SAM_STAT_CHECK_CONDITION: if (!scsi_sense_valid(sshdr)) return PR_STS_IOERR; if (sshdr->sense_key == ILLEGAL_REQUEST && (sshdr->asc == 0x26 || sshdr->asc == 0x24)) return -EINVAL; fallthrough; default: return PR_STS_IOERR; } } static int sd_pr_in_command(struct block_device *bdev, u8 sa, unsigned char *data, int data_len) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; u8 cmd[10] = { PERSISTENT_RESERVE_IN, sa }; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; put_unaligned_be16(data_len, &cmd[7]); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info) { int result, i, data_offset, num_copy_keys; u32 num_keys = keys_info->num_keys; int data_len = num_keys * 8 + 8; u8 *data; data = kzalloc(data_len, GFP_KERNEL); if (!data) return -ENOMEM; result = sd_pr_in_command(bdev, READ_KEYS, data, data_len); if (result) goto free_data; keys_info->generation = get_unaligned_be32(&data[0]); keys_info->num_keys = get_unaligned_be32(&data[4]) / 8; data_offset = 8; num_copy_keys = min(num_keys, keys_info->num_keys); for (i = 0; i < num_copy_keys; i++) { keys_info->keys[i] = get_unaligned_be64(&data[data_offset]); data_offset += 8; } free_data: kfree(data); return result; } static int sd_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; u8 data[24] = { }; int result, len; result = sd_pr_in_command(bdev, READ_RESERVATION, data, sizeof(data)); if (result) return result; len = get_unaligned_be32(&data[4]); if (!len) return 0; /* Make sure we have at least the key and type */ if (len < 14) { sdev_printk(KERN_INFO, sdev, "READ RESERVATION failed due to short return buffer of %d bytes\n", len); return -EINVAL; } rsv->generation = get_unaligned_be32(&data[0]); rsv->key = get_unaligned_be64(&data[8]); rsv->type = scsi_pr_type_to_block(data[21] & 0x0f); return 0; } static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data, sizeof(data), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_out_command(bdev, 0x01, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_out_command(bdev, 0x02, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, block_pr_type_to_scsi(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, .pr_read_reservation = sd_pr_read_reservation, }; static void scsi_disk_free_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); put_device(&sdkp->disk_dev); } static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, .get_unique_id = sd_get_unique_id, .free_disk = scsi_disk_free_disk, .pr_ops = &sd_pr_ops, }; /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); scsi_print_command(SCpnt); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (!scsi_status_is_check_condition(result) && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { static const u8 cmd[10] = { TEST_UNIT_READY }; unsigned long spintime_expire = 0; int spintime, sense_valid = 0; unsigned int the_result; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, /* Retry when scsi_status_is_good would return false 3 times */ { .result = SCMD_FAILURE_STAT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { bool media_was_present = sdkp->media_present; scsi_failures_reset_retries(&failures); the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { /* * If the drive has indicated to us that it doesn't * have any media in it, don't bother with any more * polling. */ if (media_not_present(sdkp, &sshdr)) { if (media_was_present) sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } sense_valid = scsi_sense_valid(&sshdr); } if (!scsi_status_is_check_condition(the_result)) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x24) break; /* depopulation in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x25) break; /* depopulation restoration in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { /* Return immediately and start spin cycle */ const u8 start_cmd[10] = { [0] = START_STOP, [1] = 1, [4] = sdkp->device->start_stop_pwr_cond ? 0x11 : 1, }; sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); scsi_execute_cmd(sdkp->device, start_cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk(KERN_CONT "."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk(KERN_CONT "ready\n"); else printk(KERN_CONT "not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return 0; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) { sd_printk(KERN_ERR, sdkp, "formatted with unsupported" \ " protection type %u. Disabling disk!\n", type); sdkp->protection_type = 0; return -ENODEV; } sdkp->protection_type = type; return 0; } static void sd_config_protection(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", sdkp->protection_type); sdkp->protection_type = 0; } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", sdkp->protection_type); } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (sense_valid) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, RC16_LEN, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { if (media_not_present(sdkp, &sshdr)) return -ENODEV; sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) /* Invalid Command Operation Code or * Invalid Field in CDB, just retry * silently with RC10 */ return -EINVAL; if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(16) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[8]); lba = get_unaligned_be64(&buffer[0]); if (sd_read_protection_type(sdkp, buffer) < 0) { sdkp->capacity = 0; return -ENODEV; } /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; /* RC basis */ sdkp->rc_basis = (buffer[12] >> 4) & 0x3; /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); if (buffer[14] & 0x80) { /* LBPME */ sdkp->lbpme = 1; if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; } sdkp->capacity = lba + 1; return sector_size; } static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { static const u8 cmd[10] = { READ_CAPACITY }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, /* Device reset might occur several times so retry a lot */ { .sense = UNIT_ATTENTION, .asc = 0x29, .allowed = READ_CAPACITY_RETRIES_ON_RESET, .result = SAM_STAT_CHECK_CONDITION, }, /* Any other error not listed above retry 3 times */ { .result = SCMD_FAILURE_RESULT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int sense_valid = 0; int the_result; sector_t lba; unsigned sector_size; memset(buffer, 0, 8); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, 8, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { sense_valid = scsi_sense_valid(&sshdr); if (media_not_present(sdkp, &sshdr)) return -ENODEV; } if (the_result) { sd_print_result(sdkp, "Read Capacity(10) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[4]); lba = get_unaligned_be32(&buffer[0]); if (sdp->no_read_capacity_16 && (lba == 0xffffffff)) { /* Some buggy (usb cardreader) devices return an lba of 0xffffffff when the want to report a size of 0 (with which they really mean no media is present) */ sdkp->capacity = 0; sdkp->physical_block_size = sector_size; return sector_size; } sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; } static int sd_try_rc16_first(struct scsi_device *sdp) { if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) return 0; if (sdp->scsi_level > SCSI_SPC_2) return 1; if (scsi_device_protection(sdp)) return 1; return 0; } /* * read disk capacity */ static void sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) return; if (sector_size < 0) sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size < 0) return; } else { sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size < 0) return; if ((sizeof(sdkp->capacity) > 4) && (sdkp->capacity > 0xffffffffULL)) { int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); sdkp->capacity = 1 + (sector_t) 0xffffffff; sector_size = old_sector_size; goto got_data; } /* Remember that READ CAPACITY(16) succeeded */ sdp->try_rc_10_first = 0; } } /* Some devices are known to return the total number of blocks, * not the highest block number. Some devices have versions * which do this and others which do not. Some devices we might * suspect of doing this but we don't know for certain. * * If we know the reported capacity is wrong, decrement it. If * we can only guess, then assume the number of blocks is even * (usually true but not always) and err on the side of lowering * the capacity. */ if (sdp->fix_capacity || (sdp->guess_capacity && (sdkp->capacity & 0x01))) { sd_printk(KERN_INFO, sdkp, "Adjusting the sector count " "from its reported value: %llu\n", (unsigned long long) sdkp->capacity); --sdkp->capacity; } got_data: if (sector_size == 0) { sector_size = 512; sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, " "assuming 512.\n"); } if (sector_size != 512 && sector_size != 1024 && sector_size != 2048 && sector_size != 4096) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* * The user might want to re-format the drive with * a supported sectorsize. Once this happens, it * would be relatively trivial to set the thing up. * For this reason, we leave the thing in the table. */ sdkp->capacity = 0; /* * set a bogus sector size so the normal read/write * logic in the block layer will eventually refuse any * request on this device without tripping over power * of two sector size assumptions */ sector_size = 512; } lim->logical_block_size = sector_size; lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; } /* * Print disk capacity */ static void sd_print_capacity(struct scsi_disk *sdkp, sector_t old_capacity) { int sector_size = sdkp->device->sector_size; char cap_str_2[10], cap_str_10[10]; if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; string_get_size(sdkp->capacity, sector_size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); string_get_size(sdkp->capacity, sector_size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", (unsigned long long)sdkp->capacity, sector_size, cap_str_10, cap_str_2); if (sdkp->physical_block_size != sector_size) sd_printk(KERN_NOTICE, sdkp, "%u-byte physical blocks\n", sdkp->physical_block_size); } /* called with buffer of length 512 */ static inline int sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage, unsigned char *buffer, int len, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { /* * If we must use MODE SENSE(10), make sure that the buffer length * is at least 8 bytes so that the mode sense header fits. */ if (sdkp->device->use_10_for_ms && len < 8) len = 8; return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len, SD_TIMEOUT, sdkp->max_retries, data, sshdr); } /* * read write protect setting, if possible - called only in sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) { int res; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; int old_wp = sdkp->write_prot; set_disk_ro(sdkp->disk, 0); if (sdp->skip_ms_page_3f) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); return; } if (sdp->use_192_bytes_for_3f) { res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL); } else { /* * First attempt: ask for all pages (0x3F), but only 4 bytes. * We have to start carefully: some devices hang if we ask * for more than is available. */ res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL); /* * Second attempt: ask for page 0 When only page 0 is * implemented, a request for page 3F may return Sense Key * 5: Illegal Request, Sense Code 24: Invalid field in * CDB. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL); /* * Third attempt: ask 255 bytes, as we did earlier. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255, &data, NULL); } if (res < 0) { sd_first_printk(KERN_WARNING, sdkp, "Test WP failed, assume Write Enabled\n"); } else { sdkp->write_prot = ((data.device_specific & 0x80) != 0); set_disk_ro(sdkp->disk, sdkp->write_prot); if (sdkp->first_scan || old_wp != sdkp->write_prot) { sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", sdkp->write_prot ? "on" : "off"); sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer); } } } /* * sd_read_cache_type - called only from sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) { int len = 0, res; struct scsi_device *sdp = sdkp->device; int dbd; int modepage; int first_len; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; int old_wce = sdkp->WCE; int old_rcd = sdkp->RCD; int old_dpofua = sdkp->DPOFUA; if (sdkp->cache_override) return; first_len = 4; if (sdp->skip_ms_page_8) { if (sdp->type == TYPE_RBC) goto defaults; else { if (sdp->skip_ms_page_3f) goto defaults; modepage = 0x3F; if (sdp->use_192_bytes_for_3f) first_len = 192; dbd = 0; } } else if (sdp->type == TYPE_RBC) { modepage = 6; dbd = 8; } else { modepage = 8; dbd = 0; } /* cautiously ask */ res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len, &data, &sshdr); if (res < 0) goto bad_sense; if (!data.header_length) { modepage = 6; first_len = 0; sd_first_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); } /* that went OK, now ask for the proper length */ len = data.length; /* * We're only interested in the first three bytes, actually. * But the data cache page is defined for the first 20. */ if (len < 3) goto bad_sense; else if (len > SD_BUF_SIZE) { sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter " "data from %d to %d bytes\n", len, SD_BUF_SIZE); len = SD_BUF_SIZE; } if (modepage == 0x3F && sdp->use_192_bytes_for_3f) len = 192; /* Get the data */ if (len > first_len) res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len, &data, &sshdr); if (!res) { int offset = data.header_length + data.block_descriptor_length; while (offset < len) { u8 page_code = buffer[offset] & 0x3F; u8 spf = buffer[offset] & 0x40; if (page_code == 8 || page_code == 6) { /* We're interested only in the first 3 bytes. */ if (len - offset <= 2) { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter " "data\n"); goto defaults; } else { modepage = page_code; goto Page_found; } } else { /* Go to the next page */ if (spf && len - offset > 3) offset += 4 + (buffer[offset+2] << 8) + buffer[offset+3]; else if (!spf && len - offset > 1) offset += 2 + buffer[offset+1]; else { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode " "parameter data\n"); goto defaults; } } } sd_first_printk(KERN_WARNING, sdkp, "No Caching mode page found\n"); goto defaults; Page_found: if (modepage == 8) { sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); } else { sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); sdkp->RCD = 0; } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; if (sdp->broken_fua) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); sdkp->DPOFUA = 0; } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw && !sdkp->device->use_16_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; } /* No cache flush allowed for write protected devices */ if (sdkp->WCE && sdkp->write_prot) sdkp->WCE = 0; if (sdkp->first_scan || old_wce != sdkp->WCE || old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) sd_printk(KERN_NOTICE, sdkp, "Write cache: %s, read cache: %s, %s\n", sdkp->WCE ? "enabled" : "disabled", sdkp->RCD ? "disabled" : "enabled", sdkp->DPOFUA ? "supports DPO and FUA" : "doesn't support DPO or FUA"); return; } bad_sense: if (res == -EIO && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && sshdr.asc == 0x24 && sshdr.ascq == 0x0) /* Invalid field in CDB */ sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); else sd_first_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); defaults: if (sdp->wce_default_on) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n"); sdkp->WCE = 1; } else { sd_first_printk(KERN_WARNING, sdkp, "Assuming drive cache: write through\n"); sdkp->WCE = 0; } sdkp->RCD = 0; sdkp->DPOFUA = 0; } static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id) { u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS }; struct { struct scsi_stream_status_header h; struct scsi_stream_status s; } buf; struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int res; put_unaligned_be16(stream_id, &cdb[4]); put_unaligned_be32(sizeof(buf), &cdb[10]); res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res < 0) return false; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); if (res) return false; if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status)) return false; return buf.h.stream_status[0].perm; } static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; const struct scsi_io_group_descriptor *desc, *start, *end; u16 permanent_stream_count_old; struct scsi_sense_hdr sshdr; struct scsi_mode_data data; int res; if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) return; res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0) return; start = (void *)buffer + data.header_length + 16; end = (void *)buffer + ALIGN_DOWN(data.header_length + data.length, sizeof(*end)); /* * From "SBC-5 Constrained Streams with Data Lifetimes": Device severs * should assign the lowest numbered stream identifiers to permanent * streams. */ for (desc = start; desc < end; desc++) if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start)) break; permanent_stream_count_old = sdkp->permanent_stream_count; sdkp->permanent_stream_count = desc - start; if (sdkp->rscs && sdkp->permanent_stream_count < 2) sd_printk(KERN_INFO, sdkp, "Unexpected: RSCS has been set and the permanent stream count is %u\n", sdkp->permanent_stream_count); else if (sdkp->permanent_stream_count != permanent_stream_count_old) sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n", sdkp->permanent_stream_count); } /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. */ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) { int res, offset; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return; if (sdkp->protection_type == 0) return; res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0 || !data.header_length || data.length < 6) { sd_first_printk(KERN_WARNING, sdkp, "getting Control mode page failed, assume no ATO\n"); if (res == -EIO && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return; } offset = data.header_length + data.block_descriptor_length; if ((buffer[offset] & 0x3f) != 0x0a) { sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); return; } if ((buffer[offset + 5] & 0x80) == 0) return; sdkp->ATO = 1; return; } static unsigned int sd_discard_mode(struct scsi_disk *sdkp) { if (!sdkp->lbpme) return SD_LBP_FULL; if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) return SD_LBP_UNMAP; return SD_LBP_WS16; } /* LBP VPD page tells us what to use */ if (sdkp->lbpu && sdkp->max_unmap_blocks) return SD_LBP_UNMAP; if (sdkp->lbpws) return SD_LBP_WS16; if (sdkp->lbpws10) return SD_LBP_WS10; return SD_LBP_DISABLE; } /* * Query disk device for preferred I/O sizes. */ static void sd_read_block_limits(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb0); if (!vpd || vpd->len < 16) goto out; sdkp->min_xfer_blocks = get_unaligned_be16(&vpd->data[6]); sdkp->max_xfer_blocks = get_unaligned_be32(&vpd->data[8]); sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd->data[12]); if (vpd->len >= 64) { unsigned int lba_count, desc_count; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); if (lba_count && desc_count) sdkp->max_unmap_blocks = lba_count; sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]); if (vpd->data[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); config_atomic: sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); sd_config_atomic(sdkp, lim); } out: rcu_read_unlock(); } /* Parse the Block Limits Extension VPD page (0xb7) */ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb7); if (vpd && vpd->len >= 2) sdkp->rscs = vpd->data[5] & 1; rcu_read_unlock(); } /* Query block device characteristics */ static void sd_read_block_characteristics(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; u16 rot; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb1); if (!vpd || vpd->len <= 8) { rcu_read_unlock(); return; } rot = get_unaligned_be16(&vpd->data[4]); sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); if (rot == 1) lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); else if (sdkp->zoned == 2) sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n"); } /** * sd_read_block_provisioning - Query provisioning VPD page * @sdkp: disk to query */ static void sd_read_block_provisioning(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; if (sdkp->lbpme == 0) return; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb2); if (!vpd || vpd->len < 8) { rcu_read_unlock(); return; } sdkp->lbpvpd = 1; sdkp->lbpu = (vpd->data[5] >> 7) & 1; /* UNMAP */ sdkp->lbpws = (vpd->data[5] >> 6) & 1; /* WRITE SAME(16) w/ UNMAP */ sdkp->lbpws10 = (vpd->data[5] >> 5) & 1; /* WRITE SAME(10) w/ UNMAP */ rcu_read_unlock(); } static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (sdev->host->no_write_same) { sdev->no_write_same = 1; return; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0) < 0) { struct scsi_vpd *vpd; sdev->no_report_opcodes = 1; /* Disable WRITE SAME if REPORT SUPPORTED OPERATION * CODES is unsupported and the device has an ATA * Information VPD page (SAT). */ rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd) sdev->no_write_same = 1; rcu_read_unlock(); } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0) == 1) sdkp->ws16 = 1; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME, 0) == 1) sdkp->ws10 = 1; } static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (!sdev->security_supported) return; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_IN, 0) == 1 && scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_OUT, 0) == 1) sdkp->security = 1; } static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf) { return logical_to_sectors(sdkp->device, get_unaligned_be64(buf)); } /** * sd_read_cpr - Query concurrent positioning ranges * @sdkp: disk to query */ static void sd_read_cpr(struct scsi_disk *sdkp) { struct blk_independent_access_ranges *iars = NULL; unsigned char *buffer = NULL; unsigned int nr_cpr = 0; int i, vpd_len, buf_len = SD_BUF_SIZE; u8 *desc; /* * We need to have the capacity set first for the block layer to be * able to check the ranges. */ if (sdkp->first_scan) return; if (!sdkp->capacity) goto out; /* * Concurrent Positioning Ranges VPD: there can be at most 256 ranges, * leading to a maximum page size of 64 + 256*32 bytes. */ buf_len = 64 + 256*32; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len)) goto out; /* We must have at least a 64B header and one 32B range descriptor */ vpd_len = get_unaligned_be16(&buffer[2]) + 4; if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Ranges VPD page\n"); goto out; } nr_cpr = (vpd_len - 64) / 32; if (nr_cpr == 1) { nr_cpr = 0; goto out; } iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr); if (!iars) { nr_cpr = 0; goto out; } desc = &buffer[64]; for (i = 0; i < nr_cpr; i++, desc += 32) { if (desc[0] != i) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Range number\n"); nr_cpr = 0; break; } iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8); iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16); } out: disk_set_independent_access_ranges(sdkp->disk, iars); if (nr_cpr && sdkp->nr_actuators != nr_cpr) { sd_printk(KERN_NOTICE, sdkp, "%u concurrent positioning ranges\n", nr_cpr); sdkp->nr_actuators = nr_cpr; } kfree(buffer); } static bool sd_validate_min_xfer_size(struct scsi_disk *sdkp) { struct scsi_device *sdp = sdkp->device; unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->min_xfer_blocks == 0) return false; if (min_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Preferred minimum I/O size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", min_xfer_bytes, sdkp->physical_block_size); sdkp->min_xfer_blocks = 0; return false; } sd_first_printk(KERN_INFO, sdkp, "Preferred minimum I/O size %u bytes\n", min_xfer_bytes); return true; } /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a * multiple of the physical block size, or simply garbage. */ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, unsigned int dev_max) { struct scsi_device *sdp = sdkp->device; unsigned int opt_xfer_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->opt_xfer_blocks == 0) return false; if (sdkp->opt_xfer_blocks > dev_max) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> dev_max (%u logical blocks)\n", sdkp->opt_xfer_blocks, dev_max); return false; } if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> sd driver limit (%u logical blocks)\n", sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS); return false; } if (opt_xfer_bytes < PAGE_SIZE) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes < " \ "PAGE_SIZE (%u bytes)\n", opt_xfer_bytes, (unsigned int)PAGE_SIZE); return false; } if (min_xfer_bytes && opt_xfer_bytes % min_xfer_bytes) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of preferred minimum block " \ "size (%u bytes)\n", opt_xfer_bytes, min_xfer_bytes); return false; } if (opt_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", opt_xfer_bytes, sdkp->physical_block_size); return false; } sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n", opt_xfer_bytes); return true; } static void sd_read_block_zero(struct scsi_disk *sdkp) { struct scsi_device *sdev = sdkp->device; unsigned int buf_len = sdev->sector_size; u8 *buffer, cmd[16] = { }; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer) return; if (sdev->use_16_for_rw) { cmd[0] = READ_16; put_unaligned_be64(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be32(1, &cmd[10]);/* Transfer 1 logical block */ } else { cmd[0] = READ_10; put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ } scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, SD_TIMEOUT, sdkp->max_retries, NULL); kfree(buffer); } /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; struct queue_limits lim; unsigned char *buffer; unsigned int dev_max; int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); /* * If the device is offline, don't try and read capacity or any * of the other niceties. */ if (!scsi_device_online(sdp)) goto out; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); if (!buffer) { sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " "allocation failure.\n"); goto out; } sd_spinup_disk(sdkp); lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { sd_read_capacity(sdkp, &lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation * to force the device to populate mode pages. */ if (sdp->read_before_ms) sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp, &lim); sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp, &lim); sd_zbc_read_zones(sdkp, &lim, buffer); } sd_config_discard(sdkp, &lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_io_hints(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp, &lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ sd_set_flush_flag(sdkp, &lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else lim.io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { lim.io_opt = min_not_zero(lim.io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp, &lim); kfree(buffer); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; /* * Query concurrent positioning ranges after * queue_limits_commit_update() unlocked q->limits_lock to avoid * deadlock with q->sysfs_dir_lock and q->sysfs_lock. */ if (sdkp->media_present && scsi_device_supports_vpd(sdp)) sd_read_cpr(sdkp); /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) set_capacity_and_notify(disk, 0); out: return 0; } /** * sd_unlock_native_capacity - unlock native capacity * @disk: struct gendisk to set capacity for * * Block layer calls this function if it detects that partitions * on @disk reach beyond the end of the device. If the SCSI host * implements ->unlock_native_capacity() method, it's invoked to * give it a chance to adjust the device capacity. * * CONTEXT: * Defined by block layer. Might sleep. */ static void sd_unlock_native_capacity(struct gendisk *disk) { struct scsi_device *sdev = scsi_disk(disk)->device; if (sdev->host->hostt->unlock_native_capacity) sdev->host->hostt->unlock_native_capacity(sdev); } /** * sd_format_disk_name - format disk name * @prefix: name prefix - ie. "sd" for SCSI disks * @index: index of the disk to format name for * @buf: output buffer * @buflen: length of the output buffer * * SCSI disk names starts at sda. The 26th device is sdz and the * 27th is sdaa. The last one for two lettered suffix is sdzz * which is followed by sdaaa. * * This is basically 26 base counting with one extra 'nil' entry * at the beginning from the second digit on and can be * determined using similar method as 26 base conversion with the * index shifted -1 after each digit is computed. * * CONTEXT: * Don't care. * * RETURNS: * 0 on success, -errno on failure. */ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) { const int base = 'z' - 'a' + 1; char *begin = buf + strlen(prefix); char *end = buf + buflen; char *p; int unit; p = end - 1; *p = '\0'; unit = base; do { if (p == begin) return -EINVAL; *--p = 'a' + (index % unit); index = (index / unit) - 1; } while (index >= 0); memmove(begin, p, end - p); memcpy(buf, prefix, strlen(prefix)); return 0; } /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once * for each scsi device (not just disks) present. * @dev: pointer to device object * * Returns 0 if successful (or not interested in this scsi device * (e.g. scanner)); 1 when there is an error. * * Note: this function is invoked from the scsi mid-level. * This function sets up the mapping between a given * <host,channel,id,lun> (found in sdp) and new device name * (e.g. /dev/sda). More precisely it is the block device major * and minor number that is chosen here. * * Assume sd_probe is not re-entrant (for time being) * Also think about sd_probe() and sd_remove() running coincidentally. **/ static int sd_probe(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_disk *sdkp; struct gendisk *gd; int index; int error; scsi_autopm_get_device(sdp); error = -ENODEV; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) goto out; if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) { sdev_printk(KERN_WARNING, sdp, "Unsupported ZBC host-managed device.\n"); goto out; } SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, "sd_probe\n")); error = -ENOMEM; sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL); if (!sdkp) goto out; gd = blk_mq_alloc_disk_for_queue(sdp->request_queue, &sd_bio_compl_lkclass); if (!gd) goto out_free; index = ida_alloc(&sd_index_ida, GFP_KERNEL); if (index < 0) { sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n"); goto out_put; } error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); if (error) { sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n"); goto out_free_index; } sdkp->device = sdp; sdkp->disk = gd; sdkp->index = index; sdkp->max_retries = SD_MAX_RETRIES; atomic_set(&sdkp->openers, 0); atomic_set(&sdkp->device->ioerr_cnt, 0); if (!sdp->request_queue->rq_timeout) { if (sdp->type != TYPE_MOD) blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); else blk_queue_rq_timeout(sdp->request_queue, SD_MOD_TIMEOUT); } device_initialize(&sdkp->disk_dev); sdkp->disk_dev.parent = get_device(dev); sdkp->disk_dev.class = &sd_disk_class; dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev)); error = device_add(&sdkp->disk_dev); if (error) { put_device(&sdkp->disk_dev); goto out; } dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = sdkp; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; sdkp->capacity = 0; sdkp->media_present = 1; sdkp->write_prot = 0; sdkp->cache_override = 0; sdkp->WCE = 0; sdkp->RCD = 0; sdkp->ATO = 0; sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; sd_revalidate_disk(gd); if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; } blk_pm_runtime_init(sdp->request_queue, dev); if (sdp->rpm_autosuspend) { pm_runtime_set_autosuspend_delay(dev, sdp->host->rpm_autosuspend_delay); } error = device_add_disk(dev, gd, NULL); if (error) { device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } if (sdkp->security) { sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit); if (sdkp->opal_dev) sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n"); } sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", sdp->removable ? "removable " : ""); scsi_autopm_put_device(sdp); return 0; out_free_index: ida_free(&sd_index_ida, index); out_put: put_disk(gd); out_free: kfree(sdkp); out: scsi_autopm_put_device(sdp); return error; } /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. It is called (potentially * multiple times) during sd module unload. * @dev: pointer to device object * * Note: this function is invoked from the scsi mid-level. * This function potentially frees up a device name (e.g. /dev/sdc) * that could be re-used by a subsequent sd_probe(). * This function is not called when the built-in sd driver is "exit-ed". **/ static int sd_remove(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); scsi_autopm_get_device(sdkp->device); device_del(&sdkp->disk_dev); del_gendisk(sdkp->disk); if (!sdkp->suspended) sd_shutdown(dev); put_disk(sdkp->disk); return 0; } static void scsi_disk_release(struct device *dev) { struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); kfree(sdkp); } static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { /* Power on, reset, or bus device reset occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 0, .result = SAM_STAT_CHECK_CONDITION, }, { /* Power on occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 1, .result = SAM_STAT_CHECK_CONDITION, }, { /* SCSI bus reset */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 2, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .total_allowed = 3, .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .req_flags = BLK_MQ_REQ_PM, .failures = &failures, }; struct scsi_device *sdp = sdkp->device; int res; if (start) cmd[4] |= 1; /* START */ if (sdp->start_stop_pwr_cond) cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (res > 0 && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* 0x3a is medium not present */ if (sshdr.asc == 0x3a) res = 0; } } /* SCSI error codes must not go to the generic layer */ if (res) return -EIO; return 0; } /* * Send a SYNCHRONIZE CACHE instruction down to the device through * the normal SCSI command structure. Wait for the command to * complete. */ static void sd_shutdown(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); sd_sync_cache(sdkp); } if ((system_state != SYSTEM_RESTART && sdkp->device->manage_system_start_stop) || (system_state == SYSTEM_POWER_OFF && sdkp->device->manage_shutdown)) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } } static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime) { return (sdev->manage_system_start_stop && !runtime) || (sdev->manage_runtime_start_stop && runtime); } static int sd_suspend_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ return 0; if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp); /* ignore OFFLINE device */ if (ret == -ENODEV) return 0; if (ret) return ret; } if (sd_do_start_stop(sdkp->device, runtime)) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (!runtime) ret = 0; } if (!ret) sdkp->suspended = true; return ret; } static int sd_suspend_system(struct device *dev) { if (pm_runtime_suspended(dev)) return 0; return sd_suspend_common(dev, false); } static int sd_suspend_runtime(struct device *dev) { return sd_suspend_common(dev, true); } static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; } return 0; } static int sd_resume_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); sdkp->suspended = false; } return ret; } static int sd_resume_system(struct device *dev) { if (pm_runtime_suspended(dev)) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp = sdkp ? sdkp->device : NULL; if (sdp && sdp->force_runtime_start_on_system_start) pm_request_resume(dev); return 0; } return sd_resume_common(dev, false); } static int sd_resume_runtime(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; sdp = sdkp->device; if (sdp->ignore_media_change) { /* clear the device's sense data */ static const u8 cmd[10] = { REQUEST_SENSE }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; if (scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, sdp->request_queue->rq_timeout, 1, &exec_args)) sd_printk(KERN_NOTICE, sdkp, "Failed to clear sense data\n"); } return sd_resume_common(dev, true); } static const struct dev_pm_ops sd_pm_ops = { .suspend = sd_suspend_system, .resume = sd_resume_system, .poweroff = sd_suspend_system, .restore = sd_resume_system, .runtime_suspend = sd_suspend_runtime, .runtime_resume = sd_resume_runtime, }; static struct scsi_driver sd_template = { .gendrv = { .name = "sd", .probe = sd_probe, .probe_type = PROBE_PREFER_ASYNCHRONOUS, .remove = sd_remove, .shutdown = sd_shutdown, .pm = &sd_pm_ops, }, .rescan = sd_rescan, .resume = sd_resume, .init_command = sd_init_command, .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, .eh_reset = sd_eh_reset, }; /** * init_sd - entry point for this driver (both when built in or when * a module). * * Note: this function registers this driver with the scsi mid-level. **/ static int __init init_sd(void) { int majors = 0, i, err; SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { if (__register_blkdev(sd_major(i), "sd", sd_default_probe)) continue; majors++; } if (!majors) return -ENODEV; err = class_register(&sd_disk_class); if (err) goto err_out; sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; goto err_out_class; } err = scsi_register_driver(&sd_template.gendrv); if (err) goto err_out_driver; return 0; err_out_driver: mempool_destroy(sd_page_pool); err_out_class: class_unregister(&sd_disk_class); err_out: for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); return err; } /** * exit_sd - exit point for this driver (when it is a module). * * Note: this function unregisters this driver from the scsi mid-level. **/ static void __exit exit_sd(void) { int i; SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); scsi_unregister_driver(&sd_template.gendrv); mempool_destroy(sd_page_pool); class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); } module_init(init_sd); module_exit(exit_sd); void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { scsi_print_sense_hdr(sdkp->device, sdkp->disk ? sdkp->disk->disk_name : NULL, sshdr); } void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result) { const char *hb_string = scsi_hostbyte_string(result); if (hb_string) sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=%s driverbyte=%s\n", msg, hb_string ? hb_string : "invalid", "DRIVER_OK"); else sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=0x%02x driverbyte=%s\n", msg, host_byte(result), "DRIVER_OK"); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> */ #ifndef _LINUX_HUGETLB_VMEMMAP_H #define _LINUX_HUGETLB_VMEMMAP_H #include <linux/hugetlb.h> /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/mm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { return pages_per_huge_page(h) * sizeof(struct page); } /* * Return how many vmemmap size associated with a HugeTLB page that can be * optimized and can be freed to the buddy allocator. */ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; if (!is_power_of_2(sizeof(struct page))) return 0; return size > 0 ? size : 0; } #else static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } static long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { list_splice_init(folio_list, non_hvo_folios); return 0; } static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { } static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; } #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { return hugetlb_vmemmap_optimizable_size(h) != 0; } #endif /* _LINUX_HUGETLB_VMEMMAP_H */ |
| 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/alloc.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * HPFS bitmap operations */ #include "hpfs_fn.h" static void hpfs_claim_alloc(struct super_block *s, secno sec) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free != (unsigned)-1) { if (unlikely(!sbi->sb_n_free)) { hpfs_error(s, "free count underflow, allocating sector %08x", sec); sbi->sb_n_free = -1; return; } sbi->sb_n_free--; } } static void hpfs_claim_free(struct super_block *s, secno sec) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free != (unsigned)-1) { if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) { hpfs_error(s, "free count overflow, freeing sector %08x", sec); sbi->sb_n_free = -1; return; } sbi->sb_n_free++; } } static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free_dnodes != (unsigned)-1) { if (unlikely(!sbi->sb_n_free_dnodes)) { hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec); sbi->sb_n_free_dnodes = -1; return; } sbi->sb_n_free_dnodes--; } } static void hpfs_claim_dirband_free(struct super_block *s, secno sec) { struct hpfs_sb_info *sbi = hpfs_sb(s); if (sbi->sb_n_free_dnodes != (unsigned)-1) { if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) { hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec); sbi->sb_n_free_dnodes = -1; return; } sbi->sb_n_free_dnodes++; } } /* * Check if a sector is allocated in bitmap * This is really slow. Turned on only if chk==2 */ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) { struct quad_buffer_head qbh; __le32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail; if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec); goto fail1; } hpfs_brelse4(&qbh); if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; if ((le32_to_cpu(bmp[ssec >> 5]) >> (ssec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); goto fail1; } hpfs_brelse4(&qbh); } return 0; fail1: hpfs_brelse4(&qbh); fail: return 1; } /* * Check if sector(s) have proper number and additionally check if they're * allocated in bitmap. */ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) { if (start + len < start || start < 0x12 || start + len > hpfs_sb(s)->sb_fs_size) { hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start); return 1; } if (hpfs_sb(s)->sb_chk>=2) { int i; for (i = 0; i < len; i++) if (chk_if_allocated(s, start + i, msg)) return 1; } return 0; } static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward) { struct quad_buffer_head qbh; __le32 *bmp; unsigned bs = near & ~0x3fff; unsigned nr = (near & 0x3fff) & ~(n - 1); /*unsigned mnr;*/ unsigned i, q; int a, b; secno ret = 0; if (n != 1 && n != 4) { hpfs_error(s, "Bad allocation size: %d", n); return 0; } if (bs != ~0x3fff) { if (!(bmp = hpfs_map_bitmap(s, near >> 14, &qbh, "aib"))) goto uls; } else { if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto uls; } if (!tstbits(bmp, nr, n + forward)) { ret = bs + nr; goto rt; } q = nr + n; b = 0; while ((a = tstbits(bmp, q, n + forward)) != 0) { q += a; if (n != 1) q = ((q-1)&~(n-1))+n; if (!b) { if (q>>5 != nr>>5) { b = 1; q = nr & 0x1f; } } else if (q > nr) break; } if (!a) { ret = bs + q; goto rt; } nr >>= 5; /*for (i = nr + 1; i != nr; i++, i &= 0x1ff) */ i = nr; do { if (!le32_to_cpu(bmp[i])) goto cont; if (n + forward >= 0x3f && le32_to_cpu(bmp[i]) != 0xffffffff) goto cont; q = i<<5; if (i > 0) { unsigned k = le32_to_cpu(bmp[i-1]); while (k & 0x80000000) { q--; k <<= 1; } } if (n != 1) q = ((q-1)&~(n-1))+n; while ((a = tstbits(bmp, q, n + forward)) != 0) { q += a; if (n != 1) q = ((q-1)&~(n-1))+n; if (q>>5 > i) break; } if (!a) { ret = bs + q; goto rt; } cont: i++, i &= 0x1ff; } while (i != nr); rt: if (ret) { if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (le32_to_cpu(bmp[(ret & 0x3fff) >> 5]) | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); ret = 0; goto b; } bmp[(ret & 0x3fff) >> 5] &= cpu_to_le32(~(((1 << n) - 1) << (ret & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); } b: hpfs_brelse4(&qbh); uls: return ret; } /* * Allocation strategy: 1) search place near the sector specified * 2) search bitmap where free sectors last found * 3) search all bitmaps * 4) search all bitmaps ignoring number of pre-allocated * sectors */ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forward) { secno sec; int i; unsigned n_bmps; struct hpfs_sb_info *sbi = hpfs_sb(s); int f_p = 0; int near_bmp; if (forward < 0) { forward = -forward; f_p = 1; } n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; if (near && near < sbi->sb_fs_size) { if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; near_bmp = near >> 14; } else near_bmp = n_bmps / 2; /* if (b != -1) { if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) { b &= 0x0fffffff; goto ret; } if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? forward : 0))) goto ret; */ if (!f_p) if (forward > sbi->sb_max_fwd_alloc) forward = sbi->sb_max_fwd_alloc; less_fwd: for (i = 0; i < n_bmps; i++) { if (near_bmp+i < n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i) << 14, n, forward)))) { sbi->sb_c_bitmap = near_bmp+i; goto ret; } if (!forward) { if (near_bmp-i-1 >= 0 && ((sec = alloc_in_bmp(s, (near_bmp-i-1) << 14, n, forward)))) { sbi->sb_c_bitmap = near_bmp-i-1; goto ret; } } else { if (near_bmp+i >= n_bmps && ((sec = alloc_in_bmp(s, (near_bmp+i-n_bmps) << 14, n, forward)))) { sbi->sb_c_bitmap = near_bmp+i-n_bmps; goto ret; } } if (i == 1 && sbi->sb_c_bitmap != -1 && ((sec = alloc_in_bmp(s, (sbi->sb_c_bitmap) << 14, n, forward)))) { goto ret; } } if (!f_p) { if (forward) { sbi->sb_max_fwd_alloc = forward * 3 / 4; forward /= 2; goto less_fwd; } } sec = 0; ret: if (sec) { i = 0; do hpfs_claim_alloc(s, sec + i); while (unlikely(++i < n)); } if (sec && f_p) { for (i = 0; i < forward; i++) { if (!hpfs_alloc_if_possible(s, sec + n + i)) { hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); sec = 0; break; } } } return sec; } static secno alloc_in_dirband(struct super_block *s, secno near) { unsigned nr = near; secno sec; struct hpfs_sb_info *sbi = hpfs_sb(s); if (nr < sbi->sb_dirband_start) nr = sbi->sb_dirband_start; if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size) nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; nr -= sbi->sb_dirband_start; nr >>= 2; sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); if (!sec) return 0; hpfs_claim_dirband_alloc(s, sec); return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } /* Alloc sector if it's free */ int hpfs_alloc_if_possible(struct super_block *s, secno sec) { struct quad_buffer_head qbh; __le32 *bmp; if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end; if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) { bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); hpfs_claim_alloc(s, sec); return 1; } hpfs_brelse4(&qbh); end: return 0; } /* Free sectors in bitmaps */ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) { struct quad_buffer_head qbh; __le32 *bmp; struct hpfs_sb_info *sbi = hpfs_sb(s); /*pr_info("2 - ");*/ if (!n) return; if (sec < 0x12) { hpfs_error(s, "Trying to free reserved sector %08x", sec); return; } sbi->sb_max_fwd_alloc += n > 0xffff ? 0xffff : n; if (sbi->sb_max_fwd_alloc > 0xffffff) sbi->sb_max_fwd_alloc = 0xffffff; new_map: if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "free"))) { return; } new_tst: if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f) & 1)) { hpfs_error(s, "sector %08x not allocated", sec); hpfs_brelse4(&qbh); return; } bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); hpfs_claim_free(s, sec); if (!--n) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); return; } if (!(++sec & 0x3fff)) { hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); goto new_map; } goto new_tst; } /* * Check if there are at least n free dnodes on the filesystem. * Called before adding to dnode. If we run out of space while * splitting dnodes, it would corrupt dnode tree. */ int hpfs_check_free_dnodes(struct super_block *s, int n) { int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; int i, j; __le32 *bmp; struct quad_buffer_head qbh; if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) { for (j = 0; j < 512; j++) { unsigned k; if (!le32_to_cpu(bmp[j])) continue; for (k = le32_to_cpu(bmp[j]); k; k >>= 1) if (k & 1) if (!--n) { hpfs_brelse4(&qbh); return 0; } } } hpfs_brelse4(&qbh); i = 0; if (hpfs_sb(s)->sb_c_bitmap != -1) { bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); goto chk_bmp; } chk_next: if (i == b) i++; if (i >= n_bmps) return 1; bmp = hpfs_map_bitmap(s, i, &qbh, "chkdn2"); chk_bmp: if (bmp) { for (j = 0; j < 512; j++) { u32 k; if (!le32_to_cpu(bmp[j])) continue; for (k = 0xf; k; k <<= 4) if ((le32_to_cpu(bmp[j]) & k) == k) { if (!--n) { hpfs_brelse4(&qbh); return 0; } } } hpfs_brelse4(&qbh); } i++; goto chk_next; } void hpfs_free_dnode(struct super_block *s, dnode_secno dno) { if (hpfs_sb(s)->sb_chk) if (dno & 3) { hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno); return; } if (dno < hpfs_sb(s)->sb_dirband_start || dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { hpfs_free_sectors(s, dno, 4); } else { struct quad_buffer_head qbh; __le32 *bmp; unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { return; } bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); hpfs_mark_4buffers_dirty(&qbh); hpfs_brelse4(&qbh); hpfs_claim_dirband_free(s, dno); } } struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, dnode_secno *dno, struct quad_buffer_head *qbh) { struct dnode *d; if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) { if (!(*dno = alloc_in_dirband(s, near))) if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; } else { if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) if (!(*dno = alloc_in_dirband(s, near))) return NULL; } if (!(d = hpfs_get_4sectors(s, *dno, qbh))) { hpfs_free_dnode(s, *dno); return NULL; } memset(d, 0, 2048); d->magic = cpu_to_le32(DNODE_MAGIC); d->first_free = cpu_to_le32(52); d->dirent[0] = 32; d->dirent[2] = 8; d->dirent[30] = 1; d->dirent[31] = 255; d->self = cpu_to_le32(*dno); return d; } struct fnode *hpfs_alloc_fnode(struct super_block *s, secno near, fnode_secno *fno, struct buffer_head **bh) { struct fnode *f; if (!(*fno = hpfs_alloc_sector(s, near, 1, FNODE_ALLOC_FWD))) return NULL; if (!(f = hpfs_get_sector(s, *fno, bh))) { hpfs_free_sectors(s, *fno, 1); return NULL; } memset(f, 0, 512); f->magic = cpu_to_le32(FNODE_MAGIC); f->ea_offs = cpu_to_le16(0xc4); f->btree.n_free_nodes = 8; f->btree.first_free = cpu_to_le16(8); return f; } struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *ano, struct buffer_head **bh) { struct anode *a; if (!(*ano = hpfs_alloc_sector(s, near, 1, ANODE_ALLOC_FWD))) return NULL; if (!(a = hpfs_get_sector(s, *ano, bh))) { hpfs_free_sectors(s, *ano, 1); return NULL; } memset(a, 0, 512); a->magic = cpu_to_le32(ANODE_MAGIC); a->self = cpu_to_le32(*ano); a->btree.n_free_nodes = 40; a->btree.n_used_nodes = 0; a->btree.first_free = cpu_to_le16(8); return a; } static unsigned find_run(__le32 *bmp, unsigned *idx) { unsigned len; while (tstbits(bmp, *idx, 1)) { (*idx)++; if (unlikely(*idx >= 0x4000)) return 0; } len = 1; while (!tstbits(bmp, *idx + len, 1)) len++; return len; } static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result) { int err; secno end; if (fatal_signal_pending(current)) return -EINTR; end = start + len; if (start < limit_start) start = limit_start; if (end > limit_end) end = limit_end; if (start >= end) return 0; if (end - start < minlen) return 0; err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0); if (err) return err; *result += end - start; return 0; } int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result) { int err = 0; struct hpfs_sb_info *sbi = hpfs_sb(s); unsigned idx, len, start_bmp, end_bmp; __le32 *bmp; struct quad_buffer_head qbh; *result = 0; if (!end || end > sbi->sb_fs_size) end = sbi->sb_fs_size; if (start >= sbi->sb_fs_size) return 0; if (minlen > 0x4000) return 0; if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { hpfs_lock(s); if (sb_rdonly(s)) { err = -EROFS; goto unlock_1; } if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { err = -EIO; goto unlock_1; } idx = 0; while ((len = find_run(bmp, &idx)) && !err) { err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result); idx += len; } hpfs_brelse4(&qbh); unlock_1: hpfs_unlock(s); } start_bmp = start >> 14; end_bmp = (end + 0x3fff) >> 14; while (start_bmp < end_bmp && !err) { hpfs_lock(s); if (sb_rdonly(s)) { err = -EROFS; goto unlock_2; } if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) { err = -EIO; goto unlock_2; } idx = 0; while ((len = find_run(bmp, &idx)) && !err) { err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result); idx += len; } hpfs_brelse4(&qbh); unlock_2: hpfs_unlock(s); start_bmp++; } return err; } |
| 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TCP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c * linux/net/ipv4/tcp_output.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. */ #include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/jiffies.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> #include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/inet_ecn.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #endif /* Helper returning the inet6 address from a given tcp socket. * It can be used in TCP stack instead of inet6_sk(sk). * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } static u32 tcp_v6_init_seq(const struct sk_buff *skb) { return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6.saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcpv6_seq(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } static int tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct tcp_sigpool hp; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb, u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_seq, .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; struct sk_buff *buff; struct flowi6 fl6; struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 mrst = 0, *topt; struct dst_entry *dst; __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass & ~INET_ECN_MASK, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (!newsk) return NULL; inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, tcp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. */ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto out; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: tcp_listendrop(sk); return NULL; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (nsk != sk) { if (nsk) { reason = tcp_child_process(sk, nsk, skb); if (reason) goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; u8 icsk_pending; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { int res; res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); if (!res) net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; return res; } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, }; int __init tcpv6_init(void) { int ret; net_hotdata.tcpv6_protocol = (struct inet6_protocol) { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } |
| 66 65 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 | // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/madvise.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 2002 Christoph Hellwig */ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> #include <linux/page-isolation.h> #include <linux/page_idle.h> #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/pagewalk.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> #include <asm/tlb.h> #include "internal.h" #include "swap.h" /* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again. */ #define MAX_MADVISE_GUARD_RETRIES 3 struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) { switch (behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_COLD: case MADV_PAGEOUT: case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ return 1; } } #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { struct anon_vma_name *anon_name; size_t count; /* Add 1 for NUL terminator at the end of the anon_name->name */ count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } return anon_name; } void anon_vma_name_free(struct kref *kref) { struct anon_vma_name *anon_name = container_of(kref, struct anon_vma_name, kref); kfree(anon_name); } struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); return vma->anon_name; } /* mmap_lock should be write-locked */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { struct anon_vma_name *orig_name = anon_vma_name(vma); if (!anon_name) { vma->anon_name = NULL; anon_vma_name_put(orig_name); return 0; } if (anon_vma_name_eq(orig_name, anon_name)) return 0; vma->anon_name = anon_vma_name_reuse(anon_name); anon_vma_name_put(orig_name); return 0; } #else /* CONFIG_ANON_VMA_NAME */ static int replace_anon_vma_name(struct vm_area_struct *vma, struct anon_vma_name *anon_name) { if (anon_name) return -EINVAL; return 0; } #endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_lock held for writing; * Caller should ensure anon_name stability by raising its refcount even when * anon_name belongs to a valid vma because this function might free that vma. */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; int error; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; return 0; } vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, anon_name); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; } return 0; } #ifdef CONFIG_SWAP static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->private; struct swap_iocb *splug = NULL; pte_t *ptep = NULL; spinlock_t *ptl; unsigned long addr; for (addr = start; addr < end; addr += PAGE_SIZE) { pte_t pte; swp_entry_t entry; struct folio *folio; if (!ptep++) { ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!ptep) break; } pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) continue; pte_unmap_unlock(ptep, ptl); ptep = NULL; folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, addr, &splug); if (folio) folio_put(folio); } if (ptep) pte_unmap_unlock(ptep, ptl); swap_read_unplug(splug); cond_resched(); return 0; } static const struct mm_walk_ops swapin_walk_ops = { .pmd_entry = swapin_walk_pmd_entry, .walk_lock = PGWALK_RDLOCK, }; static void shmem_swapin_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); pgoff_t end_index = linear_page_index(vma, end) - 1; struct folio *folio; struct swap_iocb *splug = NULL; rcu_read_lock(); xas_for_each(&xas, folio, end_index) { unsigned long addr; swp_entry_t entry; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue; addr = vma->vm_start + ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); xas_pause(&xas); rcu_read_unlock(); folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping), vma, addr, &splug); if (folio) folio_put(folio); rcu_read_lock(); } rcu_read_unlock(); swap_read_unplug(splug); } #endif /* CONFIG_SWAP */ /* * Schedule all required I/O operations. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; loff_t offset; *prev = vma; #ifdef CONFIG_SWAP if (!file) { walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } if (shmem_mapping(file->f_mapping)) { shmem_swapin_range(vma, start, end, file->f_mapping); lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } #else if (!file) return -EBADF; #endif if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0; } /* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); mmap_read_unlock(mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(mm); return 0; } static inline bool can_do_file_pageout(struct vm_area_struct *vma) { if (!vma->vm_file) return false; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. */ return inode_owner_or_capable(&nop_mnt_idmap, file_inode(vma->vm_file)) || file_permission(vma->vm_file, MAY_WRITE) == 0; } static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, pte_t pte, bool *any_young, bool *any_dirty) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct madvise_walk_private *private = walk->private; struct mmu_gather *tlb = private->tlb; bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *start_pte, *pte, ptent; spinlock_t *ptl; struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; int nr; if (fatal_signal_pending(current)) return -EINTR; pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && !can_do_file_pageout(vma); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; unsigned long next = pmd_addr_end(addr, end); tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto huge_unlock; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto huge_unlock; } folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ if (folio_likely_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) goto huge_unlock; if (next - addr != HPAGE_PMD_SIZE) { int err; folio_get(folio); spin_unlock(ptl); folio_lock(folio); err = split_folio(folio); folio_unlock(folio); folio_put(folio); if (!err) goto regular_folio; return 0; } if (!pageout && pmd_young(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); huge_unlock: spin_unlock(ptl); if (pageout) reclaim_pages(&folio_list); return 0; } regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; } } if (pte_none(ptent)) continue; if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, NULL); if (any_young) ptent = pte_mkyoung(ptent); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } } /* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (!folio_test_lru(folio) || folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!pageout && pte_young(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, CYDP_CLEAR_YOUNG); tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ folio_clear_referenced(folio); folio_test_clear_young(folio); if (folio_test_active(folio)) folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) folio_putback_lru(folio); else list_add(&folio->lru, &folio_list); } } else folio_deactivate(folio); } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } if (pageout) reclaim_pages(&folio_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { .pmd_entry = madvise_cold_or_pageout_pte_range, .walk_lock = PGWALK_RDLOCK, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = false, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); } static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct madvise_walk_private walk_private = { .pageout = true, .tlb = tlb, }; tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; /* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages. */ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); tlb_finish_mmu(&tlb); return 0; } static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte, ptent; struct folio *folio; int nr_swap = 0; unsigned long next; int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) return 0; tlb_change_page_size(tlb, PAGE_SIZE); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing). */ if (!pte_present(ptent)) { swp_entry_t entry; entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; free_swap_and_cache_nr(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range. */ if (folio_test_large(folio)) { bool any_young, any_dirty; nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent, &any_young, &any_dirty); if (nr < folio_nr_pages(folio)) { int err; if (folio_likely_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue; folio_get(folio); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); folio_unlock(folio); folio_put(folio); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); start_pte = pte; if (!start_pte) break; arch_enter_lazy_mmu_mode(); if (!err) nr = 0; continue; } if (any_young) ptent = pte_mkyoung(ptent); if (any_dirty) ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive. */ if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } if (folio_test_swapcache(folio) && !folio_free_swap(folio)) { folio_unlock(folio); continue; } folio_clear_dirty(folio); folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); return 0; } static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, .walk_lock = PGWALK_RDLOCK, }; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; range.start = max(vma->vm_start, start_addr); if (range.start >= vma->vm_end) return -EINVAL; range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); return 0; } /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { zap_page_range_single(vma, start, end - start, NULL); return 0; } static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, unsigned long start, unsigned long *end, int behavior) { if (!is_vm_hugetlb_page(vma)) { unsigned int forbidden = VM_PFNMAP; if (behavior != MADV_DONTNEED_LOCKED) forbidden |= VM_LOCKED; return !(vma->vm_flags & forbidden); } if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) return false; if (start & ~huge_page_mask(hstate_vma(vma))) return false; /* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed. */ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); return true; } static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { struct mm_struct *mm = vma->vm_mm; *prev = vma; if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (start == end) return 0; if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ mmap_read_lock(mm); vma = vma_lookup(mm, start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma. */ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine. */ end = vma->vm_end; } VM_WARN_ON(start >= end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) return madvise_dontneed_single_vma(vma, start, end); else if (behavior == MADV_FREE) return madvise_free_single_vma(vma, start, end); else return -EINVAL; } static long madvise_populate(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; int locked = 1; long pages; while (start < end) { /* Populate (prefault) page tables readable/writable. */ pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; } if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } start += pages * PAGE_SIZE; } return 0; } /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { loff_t offset; int error; struct file *f; struct mm_struct *mm = vma->vm_mm; *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL; } if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */ mmap_read_unlock(mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); mmap_read_lock(mm); return error; } static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) { vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; /* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that. */ if (!allow_locked) disallowed |= VM_LOCKED; if (!vma_is_anonymous(vma)) return false; if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) return false; return true; } static bool is_guard_pte_marker(pte_t ptent) { return is_pte_marker(ptent) && is_guard_swp_entry(pte_to_swp_entry(ptent)); } static int guard_install_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge return >0 so we abort the operation + zap. */ return pud_trans_huge(pudval) || pud_devmap(pudval); } static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge return >0 so we abort the operation + zap. */ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); } static int guard_install_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t pteval = ptep_get(pte); unsigned long *nr_pages = (unsigned long *)walk->private; /* If there is already a guard page marker, we have nothing to do. */ if (is_guard_pte_marker(pteval)) { (*nr_pages)++; return 0; } /* If populated return >0 so we abort the operation + zap. */ return 1; } static int guard_install_set_pte(unsigned long addr, unsigned long next, pte_t *ptep, struct mm_walk *walk) { unsigned long *nr_pages = (unsigned long *)walk->private; /* Simply install a PTE marker, this causes segfault on access. */ *ptep = make_pte_marker(PTE_MARKER_GUARD); (*nr_pages)++; return 0; } static const struct mm_walk_ops guard_install_walk_ops = { .pud_entry = guard_install_pud_entry, .pmd_entry = guard_install_pmd_entry, .pte_entry = guard_install_pte_entry, .install_pte = guard_install_set_pte, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_install(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { long err; int i; *prev = vma; if (!is_valid_guard_vma(vma, /* allow_locked = */false)) return -EINVAL; /* * If we install guard markers, then the range is no longer * empty from a page table perspective and therefore it's * appropriate to have an anon_vma. * * This ensures that on fork, we copy page tables correctly. */ err = anon_vma_prepare(vma); if (err) return err; /* * Optimistically try to install the guard marker pages first. If any * non-guard pages are encountered, give up and zap the range before * trying again. * * We try a few times before giving up and releasing back to userland to * loop around, releasing locks in the process to avoid contention. This * would only happen if there was a great many racing page faults. * * In most cases we should simply install the guard markers immediately * with no zap or looping. */ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { unsigned long nr_pages = 0; /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ err = walk_page_range_mm(vma->vm_mm, start, end, &guard_install_walk_ops, &nr_pages); if (err < 0) return err; if (err == 0) { unsigned long nr_expected_pages = PHYS_PFN(end - start); VM_WARN_ON(nr_pages != nr_expected_pages); return 0; } /* * OK some of the range have non-guard pages mapped, zap * them. This leaves existing guard pages in place. */ zap_page_range_single(vma, start, end - start, NULL); } /* * We were unable to install the guard pages due to being raced by page * faults. This should not happen ordinarily. We return to userspace and * immediately retry, relieving lock contention. */ return restart_syscall(); } static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk) { pud_t pudval = pudp_get(pud); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pud_trans_huge(pudval) || pud_devmap(pudval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pmd_t pmdval = pmdp_get(pmd); /* If huge, cannot have guard pages present, so no-op - skip. */ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) walk->action = ACTION_CONTINUE; return 0; } static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t ptent = ptep_get(pte); if (is_guard_pte_marker(ptent)) { /* Simply clear the PTE marker. */ pte_clear_not_present_full(walk->mm, addr, pte, false); update_mmu_cache(walk->vma, addr, pte); } return 0; } static const struct mm_walk_ops guard_remove_walk_ops = { .pud_entry = guard_remove_pud_entry, .pmd_entry = guard_remove_pmd_entry, .pte_entry = guard_remove_pte_entry, .walk_lock = PGWALK_RDLOCK, }; static long madvise_guard_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; /* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action. */ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL; return walk_page_range(vma->vm_mm, start, end, &guard_remove_walk_ops, NULL); } /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior. */ static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long behavior) { int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); case MADV_PAGEOUT: return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(vma, prev, start, end, behavior); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL: new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM: new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK: new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (vma->vm_flags & VM_IO) return -EINVAL; new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || vma->vm_flags & VM_SHARED) return -EINVAL; new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (vma->vm_flags & VM_DROPPABLE) return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); case MADV_GUARD_INSTALL: return madvise_guard_install(vma, prev, start, end); case MADV_GUARD_REMOVE: return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); anon_vma_name_get(anon_name); error = madvise_update_vma(vma, prev, start, end, new_flags, anon_name); anon_vma_name_put(anon_name); out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. */ static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; for (; start < end; start += size) { unsigned long pfn; struct page *page; int ret; ret = get_user_pages_fast(start, 1, 0, &page); if (ret != 1) return ret; pfn = page_to_pfn(page); /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will * no longer be a compound page. */ size = page_size(compound_head(page)); if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); ret = soft_offline_page(pfn, MF_COUNT_INCREASED); } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } if (ret) return ret; } return 0; } #endif static bool madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif return true; default: return false; } } /* Can we invoke process_madvise() on a remote mm for the specified behavior? */ static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: return true; default: return false; } } /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap * between the current vma and the original range. Any unmapped regions in the * original range will result in this function returning -ENOMEM while still * calling the visit function on all of the existing vmas in the range. * Must be called with the mmap_lock held for reading or writing. */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. * - different from the way of handling in mlock etc. */ vma = find_vma_prev(mm, start, &prev); if (vma && start > vma->vm_start) prev = vma; for (;;) { int error; /* Still start < end. */ if (!vma) return -ENOMEM; /* Here start < (end|vma->vm_end). */ if (start < vma->vm_start) { unmapped_error = -ENOMEM; start = vma->vm_start; if (start >= end) break; } /* Here vma->vm_start <= start < (end|vma->vm_end) */ tmp = vma->vm_end; if (end < tmp) tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ error = visit(vma, &prev, start, tmp, arg); if (error) return error; start = tmp; if (prev && start < prev->vm_end) start = prev->vm_end; if (start >= end) break; if (prev) vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } return unmapped_error; } #ifdef CONFIG_ANON_VMA_NAME static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned long anon_name) { int error; /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, (struct anon_vma_name *)anon_name); /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable. */ if (error == -ENOMEM) error = -EAGAIN; return error; } int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { unsigned long end; unsigned long len; if (start & ~PAGE_MASK) return -EINVAL; len = (len_in + ~PAGE_MASK) & PAGE_MASK; /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { unsigned long end; int error; int write; size_t len; struct blk_plug plug; if (!madvise_behavior_valid(behavior)) return -EINVAL; if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif write = madvise_need_mmap_write(behavior); if (write) { if (mmap_write_lock_killable(mm)) return -EINTR; } else { mmap_read_lock(mm); } start = untagged_addr_remote(mm, start); end = start + len; blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: error = madvise_populate(mm, start, end, behavior); break; default: error = madvise_walk_vmas(mm, start, end, behavior, madvise_vma_behavior); break; } blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); else mmap_read_unlock(mm); return error; } SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } /* Perform an madvise operation over a vector of addresses and lengths. */ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, int behavior) { ssize_t ret = 0; size_t total_len; total_len = iov_iter_count(iter); while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * * As we have already dropped locks, it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ if (ret == -ERESTARTNOINTR) { if (fatal_signal_pending(current)) { ret = -EINTR; break; } continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; unsigned int f_flags; if (flags != 0) { ret = -EINVAL; goto out; } ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { ret = PTR_ERR(mm); goto release_task; } /* * We need only perform this check if we are attempting to manipulate a * remote process's address space. */ if (mm != current->mm && !process_madvise_remote_valid(behavior)) { ret = -EINVAL; goto release_mm; } /* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); release_task: put_task_struct(task); free_iov: kfree(iov); out: return ret; } |
| 8562 3883 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> #include <linux/numa.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mismatch error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static __always_inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static __always_inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); if (ret == MAX_NUMNODES) ret = __first_node(srcp); return ret; } static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static __always_inline int node_state(int node, enum node_states state) { return node == 0; } static __always_inline void node_set_state(int node, enum node_states state) { } static __always_inline void node_clear_state(int node, enum node_states state) { } static __always_inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; w = nodes_weight(*maskp); switch (w) { case 0: bit = NUMA_NO_NODE; break; case 1: bit = first_node(*maskp); break; default: bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w)); break; } return bit; #else return 0; #endif } #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* Example structure for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */ |
| 3 39 39 9 9 9 9 9 9 7 2 2 2 4 39 39 39 39 39 39 39 39 5 34 8 47 39 39 39 39 34 5 39 7 5 25 2 39 9 39 39 39 39 14 17 10 5 2 5 4 4 1 4 4 1 1 3 1 40 10 10 7 3 55 2 1 12 1 7 48 1 47 47 44 5 3 1 1 1 58 58 5 3 2 3 2 3 2 5 5 42 52 52 52 52 27 13 15 15 1 42 2 1 1 1 2 1 1 2 2 11 11 5 11 11 8 8 1 2 29 13 16 9 6 11 4 7 8 11 4 11 4 15 12 3 4 11 11 3 1 12 2 9 6 10 5 8 7 10 5 11 4 12 3 14 1 11 10 4 14 1 13 2 12 3 13 2 14 1 13 1 13 2 10 2 2 1 16 15 39 39 37 42 42 42 42 42 42 3 42 5 5 40 41 1 40 40 3 37 39 39 37 2 39 39 39 45 3 1 2 39 47 5 44 7 3 40 45 5 6 43 57 50 7 9 9 9 9 6 9 9 15 11 11 11 4 4 4 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 | /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/pid_namespace.h> #include <uapi/linux/magic.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned int fuse_max_pages_limit = 256; unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); unsigned max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); MODULE_PARM_DESC(max_user_congthresh, "Global limit for the maximum congestion threshold an " "unprivileged user can set"); #define FUSE_DEFAULT_BLKSIZE 512 /** Maximum number of outstanding background requests */ #define FUSE_DEFAULT_MAX_BACKGROUND 12 /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) #ifdef CONFIG_BLOCK static struct file_system_type fuseblk_fs_type; #endif struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) { struct fuse_submount_lookup *sl; sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); if (!sl) return NULL; sl->forget = fuse_alloc_forget(); if (!sl->forget) goto out_free; return sl; out_free: kfree(sl); return NULL; } static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL); if (!fi) return NULL; fi->i_time = 0; fi->inval_mask = ~0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; fi->orig_ino = 0; fi->state = 0; fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) goto out_free; if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_inode_backing_set(fi, NULL); return &fi->inode; out_free_forget: kfree(fi->forget); out_free: kmem_cache_free(fuse_inode_cachep, fi); return NULL; } static void fuse_free_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); mutex_destroy(&fi->mutex); kfree(fi->forget); #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_put(fuse_inode_backing(fi)); kmem_cache_free(fuse_inode_cachep, fi); } static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, struct fuse_submount_lookup *sl) { if (!refcount_dec_and_test(&sl->count)) return; fuse_queue_forget(fc, sl->forget, sl->nodeid, 1); sl->forget = NULL; kfree(sl); } static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); if (FUSE_IS_DAX(inode)) fuse_dax_inode_cleanup(inode); if (fi->nlookup) { fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); fi->forget = NULL; } if (fi->submount_lookup) { fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } /* * Evict of non-deleted inode may race with outstanding * LOOKUP/READDIRPLUS requests and result in inconsistency when * the request finishes. Deal with that here by bumping a * counter that can be compared to the starting value. */ if (inode->i_nlink > 0) atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } } static int fuse_reconfigure(struct fs_context *fsc) { struct super_block *sb = fsc->root->d_sb; sync_filesystem(sb); if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; } /* * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down * so that it will fit. */ static ino_t fuse_squash_ino(u64 ino64) { ino_t ino = (ino_t) ino64; if (sizeof(ino_t) < sizeof(u64)) ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; return ino; } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, u64 attr_valid, u32 cache_mask, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); /* * Clear basic stats from invalid mask. * * Don't do this if this is coming from a fuse_iget() call and there * might have been a racing evict which would've invalidated the result * if the attr_version would've been preserved. * * !evict_ctr -> this is create * fi->attr_version != 0 -> this is not a new inode * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request */ if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; /* Sanitize nsecs */ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); inode_set_atime(inode, attr->atime, attr->atimensec); /* mtime from server may be stale due to local buffered write */ if (!(cache_mask & STATX_MTIME)) { inode_set_mtime(inode, attr->mtime, attr->mtimensec); } if (!(cache_mask & STATX_CTIME)) { inode_set_ctime(inode, attr->ctime, attr->ctimensec); } if (sx) { /* Sanitize nsecs */ sx->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); /* * Btime has been queried, cache is valid (whether or not btime * is available or not) so clear STATX_BTIME from inval_mask. * * Availability of the btime attribute is indicated in * FUSE_I_BTIME */ set_mask_bits(&fi->inval_mask, STATX_BTIME, 0); if (sx->mask & STATX_BTIME) { set_bit(FUSE_I_BTIME, &fi->state); fi->i_btime.tv_sec = sx->btime.tv_sec; fi->i_btime.tv_nsec = sx->btime.tv_nsec; } } if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); else inode->i_blkbits = inode->i_sb->s_blocksize_bits; /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the * check in may_delete(). */ fi->orig_i_mode = inode->i_mode; if (!fc->default_permissions) inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; /* * We are refreshing inode data and it is possible that another * client set suid/sgid or security.capability xattr. So clear * S_NOSEC. Ideally, we could have cleared it only if suid/sgid * was set or if security.capability xattr was set. But we don't * know if security.capability has been set or not. So clear it * anyway. Its less efficient but should be safe. */ inode->i_flags &= ~S_NOSEC; } u32 fuse_get_cache_mask(struct inode *inode) { struct fuse_conn *fc = get_fuse_conn(inode); if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) return 0; return STATX_MTIME | STATX_CTIME | STATX_SIZE; } static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, u64 attr_valid, u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); u32 cache_mask; loff_t oldsize; struct timespec64 old_mtime; spin_lock(&fi->lock); /* * In case of writeback_cache enabled, writes update mtime, ctime and * may update i_size. In these cases trust the cached value in the * inode. */ cache_mask = fuse_get_cache_mask(inode); if (cache_mask & STATX_SIZE) attr->size = i_size_read(inode); if (cache_mask & STATX_MTIME) { attr->mtime = inode_get_mtime_sec(inode); attr->mtimensec = inode_get_mtime_nsec(inode); } if (cache_mask & STATX_CTIME) { attr->ctime = inode_get_ctime_sec(inode); attr->ctimensec = inode_get_ctime_nsec(inode); } if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fi->lock); return; } old_mtime = inode_get_mtime(inode); fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, evict_ctr); oldsize = inode->i_size; /* * In case of writeback_cache enabled, the cached writes beyond EOF * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ if (!(cache_mask & STATX_SIZE)) i_size_write(inode, attr->size); spin_unlock(&fi->lock); if (!cache_mask && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { truncate_pagecache(inode, attr->size); if (!fc->explicit_inval_data) inval = true; } else if (fc->auto_inval_data) { struct timespec64 new_mtime = { .tv_sec = attr->mtime, .tv_nsec = attr->mtimensec, }; /* * Auto inval mode also checks and invalidates if mtime * has changed. */ if (!timespec64_equal(&old_mtime, &new_mtime)) inval = true; } if (inval) invalidate_inode_pages2(inode->i_mapping); } if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_dontcache(inode, attr->flags); } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, u64 attr_valid, u64 attr_version) { fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); } static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { sl->nodeid = nodeid; refcount_set(&sl->count, 1); } static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; inode_set_mtime(inode, attr->mtime, attr->mtimensec); inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode, attr->flags); } else if (S_ISDIR(inode->i_mode)) fuse_init_dir(inode); else if (S_ISLNK(inode->i_mode)) fuse_init_symlink(inode); else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { fuse_init_common(inode); init_special_inode(inode, inode->i_mode, new_decode_dev(attr->rdev)); } else BUG(); /* * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL * so they see the exact same behavior as before. */ if (!fc->posix_acl) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) return 1; else return 0; } static int fuse_inode_set(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; get_fuse_inode(inode)->nodeid = nodeid; return 0; } struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version, u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); /* * Auto mount points get their node id from the submount root, which is * not a unique identifier within this filesystem. * * To avoid conflicts, do not place submount points into the inode hash * table. */ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && S_ISDIR(attr->mode)) { struct fuse_inode *fi; inode = new_inode(sb); if (!inode) return NULL; fuse_init_inode(inode, attr, fc); fi = get_fuse_inode(inode); fi->nodeid = nodeid; fi->submount_lookup = fuse_alloc_submount_lookup(); if (!fi->submount_lookup) { iput(inode); return NULL; } /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ fuse_init_submount_lookup(fi->submount_lookup, nodeid); inode->i_flags |= S_AUTOMOUNT; goto done; } retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; fuse_init_inode(inode, attr, fc); unlock_new_inode(inode); } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); if (inode != d_inode(sb->s_root)) { remove_inode_hash(inode); iput(inode); goto retry; } } fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); done: fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, evict_ctr); return inode; } struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, struct fuse_mount **fm) { struct fuse_mount *fm_iter; struct inode *inode; WARN_ON(!rwsem_is_locked(&fc->killsb)); list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { if (!fm_iter->sb) continue; inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); if (inode) { if (fm) *fm = fm_iter; return inode; } } return NULL; } int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); spin_unlock(&fi->lock); fuse_invalidate_attr(inode); forget_all_cached_acls(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) pg_end = -1; else pg_end = (offset + len - 1) >> PAGE_SHIFT; invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); } iput(inode); return 0; } bool fuse_lock_inode(struct inode *inode) { bool locked = false; if (!get_fuse_conn(inode)->parallel_dirops) { mutex_lock(&get_fuse_inode(inode)->mutex); locked = true; } return locked; } void fuse_unlock_inode(struct inode *inode, bool locked) { if (locked) mutex_unlock(&get_fuse_inode(inode)->mutex); } static void fuse_umount_begin(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc->no_force_umount) return; fuse_abort_conn(fc); // Only retire block-device-based superblocks. if (sb->s_bdev != NULL) retire_super(sb); } static void fuse_send_destroy(struct fuse_mount *fm) { if (fm->fc->conn_init) { FUSE_ARGS(args); args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; fuse_simple_request(fm, &args); } } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) { stbuf->f_type = FUSE_SUPER_MAGIC; stbuf->f_bsize = attr->bsize; stbuf->f_frsize = attr->frsize; stbuf->f_blocks = attr->blocks; stbuf->f_bfree = attr->bfree; stbuf->f_bavail = attr->bavail; stbuf->f_files = attr->files; stbuf->f_ffree = attr->ffree; stbuf->f_namelen = attr->namelen; /* fsid is left zero */ } static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } memset(&outarg, 0, sizeof(outarg)); args.in_numargs = 0; args.opcode = FUSE_STATFS; args.nodeid = get_node_id(d_inode(dentry)); args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; } static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) { struct fuse_sync_bucket *bucket; bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); if (bucket) { init_waitqueue_head(&bucket->waitq); /* Initial active count */ atomic_set(&bucket->count, 1); } return bucket; } static void fuse_sync_fs_writes(struct fuse_conn *fc) { struct fuse_sync_bucket *bucket, *new_bucket; int count; new_bucket = fuse_sync_bucket_alloc(); spin_lock(&fc->lock); bucket = rcu_dereference_protected(fc->curr_bucket, 1); count = atomic_read(&bucket->count); WARN_ON(count < 1); /* No outstanding writes? */ if (count == 1) { spin_unlock(&fc->lock); kfree(new_bucket); return; } /* * Completion of new bucket depends on completion of this bucket, so add * one more count. */ atomic_inc(&new_bucket->count); rcu_assign_pointer(fc->curr_bucket, new_bucket); spin_unlock(&fc->lock); /* * Drop initial active count. At this point if all writes in this and * ancestor buckets complete, the count will go to zero and this task * will be woken up. */ atomic_dec(&bucket->count); wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); /* Drop temp count on descendant bucket */ fuse_sync_bucket_dec(new_bucket); kfree_rcu(bucket, rcu); } static int fuse_sync_fs(struct super_block *sb, int wait) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct fuse_conn *fc = fm->fc; struct fuse_syncfs_in inarg; FUSE_ARGS(args); int err; /* * Userspace cannot handle the wait == 0 case. Avoid a * gratuitous roundtrip. */ if (!wait) return 0; /* The filesystem is being unmounted. Nothing to do. */ if (!sb->s_root) return 0; if (!fc->sync_fs) return 0; fuse_sync_fs_writes(fc); memset(&inarg, 0, sizeof(inarg)); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.opcode = FUSE_SYNCFS; args.nodeid = get_node_id(sb->s_root->d_inode); args.out_numargs = 0; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->sync_fs = 0; err = 0; } return err; } enum { OPT_SOURCE, OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, OPT_GROUP_ID, OPT_DEFAULT_PERMISSIONS, OPT_ALLOW_OTHER, OPT_MAX_READ, OPT_BLKSIZE, OPT_ERR }; static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_string ("source", OPT_SOURCE), fsparam_u32 ("fd", OPT_FD), fsparam_u32oct ("rootmode", OPT_ROOTMODE), fsparam_uid ("user_id", OPT_USER_ID), fsparam_gid ("group_id", OPT_GROUP_ID), fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), fsparam_u32 ("blksize", OPT_BLKSIZE), fsparam_string ("subtype", OPT_SUBTYPE), {} }; static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; struct fuse_fs_context *ctx = fsc->fs_private; int opt; kuid_t kuid; kgid_t kgid; if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* * Ignore options coming from mount(MS_REMOUNT) for backward * compatibility. */ if (fsc->oldapi) return 0; return invalfc(fsc, "No changes allowed in reconfigure"); } opt = fs_parse(fsc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_SOURCE: if (fsc->source) return invalfc(fsc, "Multiple sources specified"); fsc->source = param->string; param->string = NULL; break; case OPT_SUBTYPE: if (ctx->subtype) return invalfc(fsc, "Multiple subtypes specified"); ctx->subtype = param->string; param->string = NULL; return 0; case OPT_FD: ctx->fd = result.uint_32; ctx->fd_present = true; break; case OPT_ROOTMODE: if (!fuse_valid_type(result.uint_32)) return invalfc(fsc, "Invalid rootmode"); ctx->rootmode = result.uint_32; ctx->rootmode_present = true; break; case OPT_USER_ID: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fsc->user_ns, kuid)) return invalfc(fsc, "Invalid user_id"); ctx->user_id = kuid; ctx->user_id_present = true; break; case OPT_GROUP_ID: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fsc->user_ns, kgid)) return invalfc(fsc, "Invalid group_id"); ctx->group_id = kgid; ctx->group_id_present = true; break; case OPT_DEFAULT_PERMISSIONS: ctx->default_permissions = true; break; case OPT_ALLOW_OTHER: ctx->allow_other = true; break; case OPT_MAX_READ: ctx->max_read = result.uint_32; break; case OPT_BLKSIZE: if (!ctx->is_bdev) return invalfc(fsc, "blksize only supported for fuseblk"); ctx->blksize = result.uint_32; break; default: return -EINVAL; } return 0; } static void fuse_free_fsc(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { kfree(ctx->subtype); kfree(ctx); } } static int fuse_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc->legacy_opts_show) { seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) seq_puts(m, ",default_permissions"); if (fc->allow_other) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) seq_printf(m, ",blksize=%lu", sb->s_blocksize); } #ifdef CONFIG_FUSE_DAX if (fc->dax_mode == FUSE_DAX_ALWAYS) seq_puts(m, ",dax=always"); else if (fc->dax_mode == FUSE_DAX_NEVER) seq_puts(m, ",dax=never"); else if (fc->dax_mode == FUSE_DAX_INODE_USER) seq_puts(m, ",dax=inode"); #endif return 0; } static void fuse_iqueue_init(struct fuse_iqueue *fiq, const struct fuse_iqueue_ops *ops, void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; fiq->ops = ops; fiq->priv = priv; } static void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; spin_lock_init(&fpq->lock); for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) INIT_LIST_HEAD(&fpq->processing[i]); INIT_LIST_HEAD(&fpq->io); fpq->connected = 1; } void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); spin_lock_init(&fc->bg_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; atomic64_set(&fc->khctr, 0); fc->polled_files = RB_ROOT; fc->blocked = 0; fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = fuse_max_pages_limit; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; } EXPORT_SYMBOL_GPL(fuse_conn_init); static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); put_user_ns(fc->user_ns); fc->release(fc); } void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; struct fuse_sync_bucket *bucket; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); bucket = rcu_dereference_protected(fc->curr_bucket, 1); if (bucket) { WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_free(fc); call_rcu(&fc->rcu, delayed_release); } } EXPORT_SYMBOL_GPL(fuse_conn_put); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) { refcount_inc(&fc->count); return fc; } EXPORT_SYMBOL_GPL(fuse_conn_get); static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { u64 nodeid; u32 generation; }; static struct dentry *fuse_get_dentry(struct super_block *sb, struct fuse_inode_handle *handle) { struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *inode; struct dentry *entry; int err = -ESTALE; if (handle->nodeid == 0) goto out_err; inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); if (!inode) { struct fuse_entry_out outarg; const struct qstr name = QSTR_INIT(".", 1); if (!fc->export_support) goto out_err; err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, &inode); if (err && err != -ENOENT) goto out_err; if (err || !inode) { err = -ESTALE; goto out_err; } err = -EIO; if (get_node_id(inode) != handle->nodeid) goto out_iput; } err = -ESTALE; if (inode->i_generation != handle->generation) goto out_iput; entry = d_obtain_alias(inode); if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) fuse_invalidate_entry_cache(entry); return entry; out_iput: iput(inode); out_err: return ERR_PTR(err); } static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { int len = parent ? 6 : 3; u64 nodeid; u32 generation; if (*max_len < len) { *max_len = len; return FILEID_INVALID; } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; fh[0] = (u32)(nodeid >> 32); fh[1] = (u32)(nodeid & 0xffffffff); fh[2] = generation; if (parent) { nodeid = get_fuse_inode(parent)->nodeid; generation = parent->i_generation; fh[3] = (u32)(nodeid >> 32); fh[4] = (u32)(nodeid & 0xffffffff); fh[5] = generation; } *max_len = len; return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct fuse_inode_handle handle; if ((fh_type != FILEID_INO64_GEN && fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; handle.nodeid = (u64) fid->raw[0] << 32; handle.nodeid |= (u64) fid->raw[1]; handle.generation = fid->raw[2]; return fuse_get_dentry(sb, &handle); } static struct dentry *fuse_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct fuse_inode_handle parent; if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; parent.nodeid = (u64) fid->raw[3] << 32; parent.nodeid |= (u64) fid->raw[4]; parent.generation = fid->raw[5]; return fuse_get_dentry(sb, &parent); } static struct dentry *fuse_get_parent(struct dentry *child) { struct inode *child_inode = d_inode(child); struct fuse_conn *fc = get_fuse_conn(child_inode); struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), &dotdot_name, &outarg, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); return ERR_PTR(err); } parent = d_obtain_alias(inode); if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) fuse_invalidate_entry_cache(parent); return parent; } /* only for fid encoding; no support for file handle */ static const struct export_operations fuse_export_fid_operations = { .encode_fh = fuse_encode_fh, }; static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, .encode_fh = fuse_encode_fh, .get_parent = fuse_get_parent, }; static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; static void sanitize_global_limit(unsigned *limit) { /* * The default maximum number of async requests is calculated to consume * 1/2^13 of the total memory, assuming 392 bytes per request. */ if (*limit == 0) *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; } static int set_global_limit(const char *val, const struct kernel_param *kp) { int rv; rv = param_set_uint(val, kp); if (rv) return rv; sanitize_global_limit((unsigned *)kp->arg); return 0; } static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) { int cap_sys_admin = capable(CAP_SYS_ADMIN); if (arg->minor < 13) return; sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); spin_lock(&fc->bg_lock); if (arg->max_background) { fc->max_background = arg->max_background; if (!cap_sys_admin && fc->max_background > max_user_bgreq) fc->max_background = max_user_bgreq; } if (arg->congestion_threshold) { fc->congestion_threshold = arg->congestion_threshold; if (!cap_sys_admin && fc->congestion_threshold > max_user_congthresh) fc->congestion_threshold = max_user_congthresh; } spin_unlock(&fc->bg_lock); } struct fuse_init_args { struct fuse_args args; struct fuse_init_in in; struct fuse_init_out out; }; static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_conn *fc = fm->fc; struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) ok = false; else { unsigned long ra_pages; process_init_limits(fc, arg); if (arg->minor >= 6) { u64 flags = arg->flags; if (flags & FUSE_INIT_EXT) flags |= (u64) arg->flags2 << 32; ra_pages = arg->max_readahead / PAGE_SIZE; if (flags & FUSE_ASYNC_READ) fc->async_read = 1; if (!(flags & FUSE_POSIX_LOCKS)) fc->no_lock = 1; if (arg->minor >= 17) { if (!(flags & FUSE_FLOCK_LOCKS)) fc->no_flock = 1; } else { if (!(flags & FUSE_POSIX_LOCKS)) fc->no_flock = 1; } if (flags & FUSE_ATOMIC_O_TRUNC) fc->atomic_o_trunc = 1; if (arg->minor >= 9) { /* LOOKUP has dependency on proto version */ if (flags & FUSE_EXPORT_SUPPORT) fc->export_support = 1; } if (flags & FUSE_BIG_WRITES) fc->big_writes = 1; if (flags & FUSE_DONT_MASK) fc->dont_mask = 1; if (flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; else if (flags & FUSE_EXPLICIT_INVAL_DATA) fc->explicit_inval_data = 1; if (flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; if (flags & FUSE_READDIRPLUS_AUTO) fc->readdirplus_auto = 1; } if (flags & FUSE_ASYNC_DIO) fc->async_dio = 1; if (flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; if (flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; if (flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fm->sb->s_time_gran = arg->time_gran; if ((flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; } if (flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; if (flags & FUSE_ABORT_ERROR) fc->abort_err = 1; if (flags & FUSE_MAX_PAGES) { fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && !fuse_dax_check_alignment(fc, arg->map_alignment)) { ok = false; } if (flags & FUSE_HAS_INODE_DAX) fc->inode_dax = 1; } if (flags & FUSE_HANDLE_KILLPRIV_V2) { fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } if (flags & FUSE_SETXATTR_EXT) fc->setxattr_ext = 1; if (flags & FUSE_SECURITY_CTX) fc->init_security = 1; if (flags & FUSE_CREATE_SUPP_GROUP) fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; /* * max_stack_depth is the max stack depth of FUSE fs, * so it has to be at least 1 to support passthrough * to backing files. * * with max_stack_depth > 1, the backing files can be * on a stacked fs (e.g. overlayfs) themselves and with * max_stack_depth == 1, FUSE fs can be stacked as the * underlying fs of a stacked fs (e.g. overlayfs). * * Also don't allow the combination of FUSE_PASSTHROUGH * and FUSE_WRITEBACK_CACHE, current design doesn't handle * them together. */ if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && (flags & FUSE_PASSTHROUGH) && arg->max_stack_depth > 0 && arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && !(flags & FUSE_WRITEBACK_CACHE)) { fc->passthrough = 1; fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; if (flags & FUSE_ALLOW_IDMAP) { if (fc->default_permissions) fm->sb->s_iflags &= ~SB_I_NOIDMAP; else ok = false; } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } kfree(ia); if (!ok) { fc->conn_init = 0; fc->conn_error = 1; } fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; flags = FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; if (fuse_is_inode_dax_mode(fm->fc->dax_mode)) flags |= FUSE_HAS_INODE_DAX; #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); ia->args.in_args[0].value = &ia->in; ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ ia->args.out_argvar = true; ia->args.out_args[0].size = sizeof(ia->out); ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; ia->args.end = process_init_reply; if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); kfree(fc); } EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; char *suffix = ""; if (sb->s_bdev) { suffix = "-fuseblk"; /* * sb->s_bdi points to blkdev's bdi however we want to redirect * it to our private bdi... */ bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), MINOR(fc->dev), suffix); if (err) return err; /* fuse does it's own writeback accounting */ sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + * writeback threshold. * * This gives about 1M of write buffer for memory maps on a * machine with 1G and 10% dirty_ratio, which should be more * than enough. * * Privileged users can raise it by writing to * * /sys/class/bdi/<bdi>/max_ratio */ bdi_set_max_ratio(sb->s_bdi, 1); return 0; } struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); if (!fud) return NULL; pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!pq) { kfree(fud); return NULL; } fud->pq.processing = pq; fuse_pqueue_init(&fud->pq); return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) { fud->fc = fuse_conn_get(fc); spin_lock(&fc->lock); list_add_tail(&fud->entry, &fc->devices); spin_unlock(&fc->lock); } EXPORT_SYMBOL_GPL(fuse_dev_install); struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) { struct fuse_dev *fud; fud = fuse_dev_alloc(); if (!fud) return NULL; fuse_dev_install(fud, fc); return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); void fuse_dev_free(struct fuse_dev *fud) { struct fuse_conn *fc = fud->fc; if (fc) { spin_lock(&fc->lock); list_del(&fud->entry); spin_unlock(&fc->lock); fuse_conn_put(fc); } kfree(fud->pq.processing); kfree(fud); } EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { struct timespec64 atime = inode_get_atime(&fi->inode); struct timespec64 mtime = inode_get_mtime(&fi->inode); struct timespec64 ctime = inode_get_ctime(&fi->inode); *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, .atime = atime.tv_sec, .mtime = mtime.tv_sec, .ctime = ctime.tv_sec, .atimensec = atime.tv_nsec, .mtimensec = mtime.tv_nsec, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, .uid = __kuid_val(fi->inode.i_uid), .gid = __kgid_val(fi->inode.i_gid), .rdev = fi->inode.i_rdev, .blksize = 1u << fi->inode.i_blkbits, }; } static void fuse_sb_defaults(struct super_block *sb) { sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); } static int fuse_fill_super_submount(struct super_block *sb, struct fuse_inode *parent_fi) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct super_block *parent_sb = parent_fi->inode.i_sb; struct fuse_attr root_attr; struct inode *root; struct fuse_submount_lookup *sl; struct fuse_inode *fi; fuse_sb_defaults(sb); fm->sb = sb; WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); if (parent_sb->s_subtype && !sb->s_subtype) return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. fuse_iget() does * that, though, so undo it here. */ fi = get_fuse_inode(root); fi->nlookup--; sb->s_d_op = &fuse_dentry_operations; sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; /* * Grab the parent's submount_lookup pointer and take a * reference on the shared nlookup from the parent. This is to * prevent the last forget for this nodeid from getting * triggered until all users have finished with it. */ sl = parent_fi->submount_lookup; WARN_ON(!sl); if (sl) { refcount_inc(&sl->count); fi->submount_lookup = sl; } return 0; } /* Filesystem context private data holds the FUSE inode of the mount point */ static int fuse_get_tree_submount(struct fs_context *fsc) { struct fuse_mount *fm; struct fuse_inode *mp_fi = fsc->fs_private; struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); struct super_block *sb; int err; fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); if (!fm) return -ENOMEM; fm->fc = fuse_conn_get(fc); fsc->s_fs_info = fm; sb = sget_fc(fsc, NULL, set_anon_super_fc); if (fsc->s_fs_info) fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); /* Initialize superblock, making @mp_fi its root */ err = fuse_fill_super_submount(sb, mp_fi); if (err) { deactivate_locked_super(sb); return err; } down_write(&fc->killsb); list_add_tail(&fm->fc_entry, &fc->mounts); up_write(&fc->killsb); sb->s_flags |= SB_ACTIVE; fsc->root = dget(sb->s_root); return 0; } static const struct fs_context_operations fuse_context_submount_ops = { .get_tree = fuse_get_tree_submount, }; int fuse_init_fs_context_submount(struct fs_context *fsc) { fsc->ops = &fuse_context_submount_ops; return 0; } EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; struct fuse_mount *fm = get_fuse_mount_super(sb); struct fuse_conn *fc = fm->fc; struct inode *root; struct dentry *root_dentry; int err; err = -EINVAL; if (sb->s_flags & SB_MANDLOCK) goto err; rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); fuse_sb_defaults(sb); if (ctx->is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; } sb->s_subtype = ctx->subtype; ctx->subtype = NULL; if (IS_ENABLED(CONFIG_FUSE_DAX)) { err = fuse_dax_conn_alloc(fc, ctx->dax_mode, ctx->dax_dev); if (err) goto err; } if (ctx->fudptr) { err = -ENOMEM; fud = fuse_dev_alloc_install(fc); if (!fud) goto err_free_dax; } fc->dev = sb->s_dev; fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; /* Handle umasking inside the fuse code */ if (sb->s_flags & SB_POSIXACL) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; fc->default_permissions = ctx->default_permissions; fc->allow_other = ctx->allow_other; fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->legacy_opts_show = ctx->legacy_opts_show; fc->max_read = max_t(unsigned int, 4096, ctx->max_read); fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; mutex_lock(&fuse_mutex); err = -EINVAL; if (ctx->fudptr && *ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); if (err) goto err_unlock; list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; if (ctx->fudptr) *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); return 0; err_unlock: mutex_unlock(&fuse_mutex); dput(root_dentry); err_dev_free: if (fud) fuse_dev_free(fud); err_free_dax: if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); err: return err; } EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; int err; if (!ctx->file || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) return -EINVAL; /* * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. */ if ((ctx->file->f_op != &fuse_dev_operations) || (ctx->file->f_cred->user_ns != sb->s_user_ns)) return -EINVAL; ctx->fudptr = &ctx->file->private_data; err = fuse_fill_super_common(sb, ctx); if (err) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; } /* * This is the path where user supplied an already initialized fuse dev. In * this case never create a new super if the old one is gone. */ static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) { return -ENOTCONN; } static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) { return fsc->sget_key == get_fuse_conn_super(sb); } static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; struct fuse_conn *fc; struct fuse_mount *fm; struct super_block *sb; int err; fc = kmalloc(sizeof(*fc), GFP_KERNEL); if (!fc) return -ENOMEM; fm = kzalloc(sizeof(*fm), GFP_KERNEL); if (!fm) { kfree(fc); return -ENOMEM; } fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; fsc->s_fs_info = fm; if (ctx->fd_present) ctx->file = fget(ctx->fd); if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { err = get_tree_bdev(fsc, fuse_fill_super); goto out; } /* * While block dev mount can be initialized with a dummy device fd * (found by device name), normal fuse mounts can't */ err = -EINVAL; if (!ctx->file) goto out; /* * Allow creating a fuse mount with an already initialized fuse * connection */ fud = READ_ONCE(ctx->file->private_data); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); err = PTR_ERR_OR_ZERO(sb); if (!IS_ERR(sb)) fsc->root = dget(sb->s_root); } else { err = get_tree_nodev(fsc, fuse_fill_super); } out: if (fsc->s_fs_info) fuse_mount_destroy(fm); if (ctx->file) fput(ctx->file); return err; } static const struct fs_context_operations fuse_context_ops = { .free = fuse_free_fsc, .parse_param = fuse_parse_param, .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, }; /* * Set up the filesystem mount context. */ static int fuse_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->max_read = ~0; ctx->blksize = FUSE_DEFAULT_BLKSIZE; ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK if (fsc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; ctx->destroy = true; } #endif fsc->fs_private = ctx; fsc->ops = &fuse_context_ops; return 0; } bool fuse_mount_remove(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; bool last = false; down_write(&fc->killsb); list_del_init(&fm->fc_entry); if (list_empty(&fc->mounts)) last = true; up_write(&fc->killsb); return last; } EXPORT_SYMBOL_GPL(fuse_mount_remove); void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; if (fc->destroy) fuse_send_destroy(fm); fuse_abort_conn(fc); fuse_wait_aborted(fc); if (!list_empty(&fc->entry)) { mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); } } EXPORT_SYMBOL_GPL(fuse_conn_destroy); static void fuse_sb_destroy(struct super_block *sb) { struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; if (sb->s_root) { last = fuse_mount_remove(fm); if (last) fuse_conn_destroy(fm); } } void fuse_mount_destroy(struct fuse_mount *fm) { fuse_conn_put(fm->fc); kfree_rcu(fm, rcu); } EXPORT_SYMBOL(fuse_mount_destroy); static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); static inline int register_fuseblk(void) { return register_filesystem(&fuseblk_fs_type); } static inline void unregister_fuseblk(void) { unregister_filesystem(&fuseblk_fs_type); } #else static inline int register_fuseblk(void) { return 0; } static inline void unregister_fuseblk(void) { } #endif static void fuse_inode_init_once(void *foo) { struct inode *inode = foo; inode_init_once(inode); } static int __init fuse_fs_init(void) { int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", sizeof(struct fuse_inode), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out; err = register_fuseblk(); if (err) goto out2; err = register_filesystem(&fuse_fs_type); if (err) goto out3; err = fuse_sysctl_register(); if (err) goto out4; return 0; out4: unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: kmem_cache_destroy(fuse_inode_cachep); out: return err; } static void fuse_fs_cleanup(void) { fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(fuse_inode_cachep); } static struct kobject *fuse_kobj; static int fuse_sysfs_init(void) { int err; fuse_kobj = kobject_create_and_add("fuse", fs_kobj); if (!fuse_kobj) { err = -ENOMEM; goto out_err; } err = sysfs_create_mount_point(fuse_kobj, "connections"); if (err) goto out_fuse_unregister; return 0; out_fuse_unregister: kobject_put(fuse_kobj); out_err: return err; } static void fuse_sysfs_cleanup(void) { sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } static int __init fuse_init(void) { int res; pr_info("init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); INIT_LIST_HEAD(&fuse_conn_list); res = fuse_fs_init(); if (res) goto err; res = fuse_dev_init(); if (res) goto err_fs_cleanup; res = fuse_sysfs_init(); if (res) goto err_dev_cleanup; res = fuse_ctl_init(); if (res) goto err_sysfs_cleanup; sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); return 0; err_sysfs_cleanup: fuse_sysfs_cleanup(); err_dev_cleanup: fuse_dev_cleanup(); err_fs_cleanup: fuse_fs_cleanup(); err: return res; } static void __exit fuse_exit(void) { pr_debug("exit\n"); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); fuse_dev_cleanup(); } module_init(fuse_init); module_exit(fuse_exit); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_XATTR_H #define _BCACHEFS_XATTR_H #include "str_hash.h" extern const struct bch_hash_desc bch2_xattr_hash_desc; int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) struct xattr_search_key { u8 type; struct qstr name; }; #define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ { .type = _type, .name = QSTR_INIT(_name, _len) }) struct dentry; struct xattr_handler; struct bch_hash_info; struct bch_inode_info; /* Exported for cmd_migrate.c in tools: */ int bch2_xattr_set(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); extern const struct xattr_handler *bch2_xattr_handlers[]; #endif /* _BCACHEFS_XATTR_H */ |
| 114 113 114 114 114 114 114 114 114 114 114 114 2 114 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2022 Oracle. All Rights Reserved. * Author: Allison Henderson <allison.henderson@oracle.com> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_item.h" #include "xfs_trace.h" #include "xfs_trans_space.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_parent.h" struct kmem_cache *xfs_attri_cache; struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_attri_log_item, attri_item); } /* * Shared xattr name/value buffers for logged extended attribute operations * * When logging updates to extended attributes, we can create quite a few * attribute log intent items for a single xattr update. To avoid cycling the * memory allocator and memcpy overhead, the name (and value, for setxattr) * are kept in a refcounted object that is shared across all related log items * and the upper-level deferred work state structure. The shared buffer has * a control structure, followed by the name, and then the value. */ static inline struct xfs_attri_log_nameval * xfs_attri_log_nameval_get( struct xfs_attri_log_nameval *nv) { if (!refcount_inc_not_zero(&nv->refcount)) return NULL; return nv; } static inline void xfs_attri_log_nameval_put( struct xfs_attri_log_nameval *nv) { if (!nv) return; if (refcount_dec_and_test(&nv->refcount)) kvfree(nv); } static inline struct xfs_attri_log_nameval * xfs_attri_log_nameval_alloc( const void *name, unsigned int name_len, const void *new_name, unsigned int new_name_len, const void *value, unsigned int value_len, const void *new_value, unsigned int new_value_len) { struct xfs_attri_log_nameval *nv; /* * This could be over 64kB in length, so we have to use kvmalloc() for * this. But kvmalloc() utterly sucks, so we use our own version. */ nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + name_len + new_name_len + value_len + new_value_len); nv->name.i_addr = nv + 1; nv->name.i_len = name_len; nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; memcpy(nv->name.i_addr, name, name_len); if (new_name_len) { nv->new_name.i_addr = nv->name.i_addr + name_len; nv->new_name.i_len = new_name_len; memcpy(nv->new_name.i_addr, new_name, new_name_len); } else { nv->new_name.i_addr = NULL; nv->new_name.i_len = 0; } nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME; if (value_len) { nv->value.i_addr = nv->name.i_addr + name_len + new_name_len; nv->value.i_len = value_len; memcpy(nv->value.i_addr, value, value_len); } else { nv->value.i_addr = NULL; nv->value.i_len = 0; } nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; if (new_value_len) { nv->new_value.i_addr = nv->name.i_addr + name_len + new_name_len + value_len; nv->new_value.i_len = new_value_len; memcpy(nv->new_value.i_addr, new_value, new_value_len); } else { nv->new_value.i_addr = NULL; nv->new_value.i_len = 0; } nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE; refcount_set(&nv->refcount, 1); return nv; } STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kvfree(attrip->attri_item.li_lv_shadow); xfs_attri_log_nameval_put(attrip->attri_nameval); kmem_cache_free(xfs_attri_cache, attrip); } /* * Freeing the attrip requires that we remove it from the AIL if it has already * been placed there. However, the ATTRI may not yet have been placed in the * AIL when called by xfs_attri_release() from ATTRD processing due to the * ordering of committed vs unpin operations in bulk insert operations. Hence * the reference count to ensure only the last caller frees the ATTRI. */ STATIC void xfs_attri_release( struct xfs_attri_log_item *attrip) { ASSERT(atomic_read(&attrip->attri_refcount) > 0); if (!atomic_dec_and_test(&attrip->attri_refcount)) return; xfs_trans_ail_delete(&attrip->attri_item, 0); xfs_attri_item_free(attrip); } STATIC void xfs_attri_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attri_log_nameval *nv = attrip->attri_nameval; *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + xlog_calc_iovec_len(nv->name.i_len); if (nv->new_name.i_len) { *nvecs += 1; *nbytes += xlog_calc_iovec_len(nv->new_name.i_len); } if (nv->value.i_len) { *nvecs += 1; *nbytes += xlog_calc_iovec_len(nv->value.i_len); } if (nv->new_value.i_len) { *nvecs += 1; *nbytes += xlog_calc_iovec_len(nv->new_value.i_len); } } /* * This is called to fill in the log iovecs for the given attri log * item. We use 1 iovec for the attri_format_item, 1 for the name, and * another for the value if it is present */ STATIC void xfs_attri_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_log_iovec *vecp = NULL; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; attrip->attri_format.alfi_type = XFS_LI_ATTRI; attrip->attri_format.alfi_size = 1; /* * This size accounting must be done before copying the attrip into the * iovec. If we do it after, the wrong size will be recorded to the log * and we trip across assertion checks for bad region sizes later during * the log recovery. */ ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; if (nv->new_name.i_len > 0) attrip->attri_format.alfi_size++; if (nv->value.i_len > 0) attrip->attri_format.alfi_size++; if (nv->new_value.i_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); xlog_copy_from_iovec(lv, &vecp, &nv->name); if (nv->new_name.i_len > 0) xlog_copy_from_iovec(lv, &vecp, &nv->new_name); if (nv->value.i_len > 0) xlog_copy_from_iovec(lv, &vecp, &nv->value); if (nv->new_value.i_len > 0) xlog_copy_from_iovec(lv, &vecp, &nv->new_value); } /* * The unpin operation is the last place an ATTRI is manipulated in the log. It * is either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the ATTRI transaction has been successfully committed to make * it this far. Therefore, we expect whoever committed the ATTRI to either * construct and commit the ATTRD or drop the ATTRD's reference in the event of * error. Simply drop the log's ATTRI reference now that the log is done with * it. */ STATIC void xfs_attri_item_unpin( struct xfs_log_item *lip, int remove) { xfs_attri_release(ATTRI_ITEM(lip)); } STATIC void xfs_attri_item_release( struct xfs_log_item *lip) { xfs_attri_release(ATTRI_ITEM(lip)); } /* * Allocate and initialize an attri item. Caller may allocate an additional * trailing buffer for name and value */ STATIC struct xfs_attri_log_item * xfs_attri_init( struct xfs_mount *mp, struct xfs_attri_log_nameval *nv) { struct xfs_attri_log_item *attrip; attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL); /* * Grab an extra reference to the name/value buffer for this log item. * The caller retains its own reference! */ attrip->attri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attrip->attri_nameval); xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, &xfs_attri_item_ops); attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip; atomic_set(&attrip->attri_refcount, 2); return attrip; } static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_attrd_log_item, attrd_item); } STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { kvfree(attrdp->attrd_item.li_lv_shadow); kmem_cache_free(xfs_attrd_cache, attrdp); } STATIC void xfs_attrd_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { *nvecs += 1; *nbytes += sizeof(struct xfs_attrd_log_format); } /* * This is called to fill in the log iovecs for the given attrd log item. We use * only 1 iovec for the attrd_format, and we point that at the attr_log_format * structure embedded in the attrd item. */ STATIC void xfs_attrd_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); struct xfs_log_iovec *vecp = NULL; attrdp->attrd_format.alfd_type = XFS_LI_ATTRD; attrdp->attrd_format.alfd_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT, &attrdp->attrd_format, sizeof(struct xfs_attrd_log_format)); } /* * The ATTRD is either committed or aborted if the transaction is canceled. If * the transaction is canceled, drop our reference to the ATTRI and free the * ATTRD. */ STATIC void xfs_attrd_item_release( struct xfs_log_item *lip) { struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip); xfs_attri_release(attrdp->attrd_attrip); xfs_attrd_item_free(attrdp); } static struct xfs_log_item * xfs_attrd_item_intent( struct xfs_log_item *lip) { return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } static inline unsigned int xfs_attr_log_item_op(const struct xfs_attri_log_format *attrp) { return attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; } /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( struct xfs_trans *tp, struct xfs_attri_log_item *attrip, const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attr->xattri_nameval; struct xfs_da_args *args = attr->xattri_da_args; /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; attrp->alfi_ino = args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; attrp->alfi_value_len = nv->value.i_len; switch (xfs_attr_log_item_op(attrp)) { case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: ASSERT(nv->value.i_len == nv->new_value.i_len); attrp->alfi_igen = VFS_I(args->dp)->i_generation; attrp->alfi_old_name_len = nv->name.i_len; attrp->alfi_new_name_len = nv->new_name.i_len; break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_PPTR_SET: attrp->alfi_igen = VFS_I(args->dp)->i_generation; fallthrough; default: attrp->alfi_name_len = nv->name.i_len; break; } ASSERT(!(args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); attrp->alfi_attr_filter = args->attr_filter; } /* Get an ATTRI. */ static struct xfs_log_item * xfs_attr_create_intent( struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort) { struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; struct xfs_attr_intent *attr; struct xfs_da_args *args; ASSERT(count == 1); /* * Each attr item only performs one attribute operation at a time, so * this is a list of one */ attr = list_first_entry_or_null(items, struct xfs_attr_intent, xattri_list); args = attr->xattri_da_args; if (!(args->op_flags & XFS_DA_OP_LOGGED)) return NULL; /* * Create a buffer to store the attribute name and value. This buffer * will be shared between the higher level deferred xattr work state * and the lower level xattr log items. */ if (!attr->xattri_nameval) { /* * Transfer our reference to the name/value buffer to the * deferred work state structure. */ attr->xattri_nameval = xfs_attri_log_nameval_alloc( args->name, args->namelen, args->new_name, args->new_namelen, args->value, args->valuelen, args->new_value, args->new_valuelen); } attrip = xfs_attri_init(mp, attr->xattri_nameval); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; } static inline void xfs_attr_free_item( struct xfs_attr_intent *attr) { if (attr->xattri_da_state) xfs_da_state_free(attr->xattri_da_state); xfs_attri_log_nameval_put(attr->xattri_nameval); if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) kfree(attr); else kmem_cache_free(xfs_attr_intent_cache, attr); } static inline struct xfs_attr_intent *attri_entry(const struct list_head *e) { return list_entry(e, struct xfs_attr_intent, xattri_list); } /* Process an attr. */ STATIC int xfs_attr_finish_item( struct xfs_trans *tp, struct xfs_log_item *done, struct list_head *item, struct xfs_btree_cur **state) { struct xfs_attr_intent *attr = attri_entry(item); struct xfs_da_args *args; int error; args = attr->xattri_da_args; /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } /* If an attr removal is trivially complete, we're done. */ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && !xfs_inode_hasattr(args->dp)) { error = 0; goto out; } error = xfs_attr_set_iter(attr); if (!error && attr->xattri_dela_state != XFS_DAS_DONE) return -EAGAIN; out: xfs_attr_free_item(attr); return error; } /* Abort all pending ATTRs. */ STATIC void xfs_attr_abort_intent( struct xfs_log_item *intent) { xfs_attri_release(ATTRI_ITEM(intent)); } /* Cancel an attr */ STATIC void xfs_attr_cancel_item( struct list_head *item) { struct xfs_attr_intent *attr = attri_entry(item); xfs_attr_free_item(attr); } STATIC bool xfs_attri_item_match( struct xfs_log_item *lip, uint64_t intent_id) { return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id; } static inline bool xfs_attri_validate_namelen(unsigned int namelen) { return namelen > 0 && namelen <= XATTR_NAME_MAX; } /* Is this recovered ATTRI format ok? */ static inline bool xfs_attri_validate( struct xfs_mount *mp, struct xfs_attri_log_format *attrp) { unsigned int op = xfs_attr_log_item_op(attrp); if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) return false; if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) return false; if (!xfs_attr_check_namespace(attrp->alfi_attr_filter & XFS_ATTR_NSP_ONDISK_MASK)) return false; switch (op) { case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: if (!xfs_has_parent(mp)) return false; if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) return false; if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) return false; if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) return false; break; case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: if (!xfs_is_using_logged_xattrs(mp)) return false; if (attrp->alfi_value_len > XATTR_SIZE_MAX) return false; if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) return false; break; case XFS_ATTRI_OP_FLAGS_REMOVE: if (!xfs_is_using_logged_xattrs(mp)) return false; if (attrp->alfi_value_len != 0) return false; if (!xfs_attri_validate_namelen(attrp->alfi_name_len)) return false; break; case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: if (!xfs_has_parent(mp)) return false; if (!xfs_attri_validate_namelen(attrp->alfi_old_name_len)) return false; if (!xfs_attri_validate_namelen(attrp->alfi_new_name_len)) return false; if (attrp->alfi_value_len != sizeof(struct xfs_parent_rec)) return false; if (!(attrp->alfi_attr_filter & XFS_ATTR_PARENT)) return false; break; default: return false; } return xfs_verify_ino(mp, attrp->alfi_ino); } static int xfs_attri_iread_extents( struct xfs_inode *ip) { struct xfs_trans *tp; int error; error = xfs_trans_alloc_empty(ip->i_mount, &tp); if (error) return error; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_trans_cancel(tp); return error; } static inline struct xfs_attr_intent * xfs_attri_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct xfs_attri_log_format *attrp, struct xfs_inode **ipp, struct xfs_attri_log_nameval *nv) { struct xfs_attr_intent *attr; struct xfs_da_args *args; struct xfs_inode *ip; int local; int error; /* * Parent pointer attr items record the generation but regular logged * xattrs do not; select the right iget function. */ switch (xfs_attr_log_item_op(attrp)) { case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: error = xlog_recover_iget_handle(mp, attrp->alfi_ino, attrp->alfi_igen, &ip); break; default: error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); break; } if (error) { xfs_irele(ip); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attrp, sizeof(*attrp)); return ERR_PTR(-EFSCORRUPTED); } if (xfs_inode_has_attr_fork(ip)) { error = xfs_attri_iread_extents(ip); if (error) { xfs_irele(ip); return ERR_PTR(error); } } attr = kzalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; attr->xattri_op_flags = xfs_attr_log_item_op(attrp); /* * We're reconstructing the deferred work state structure from the * recovered log item. Grab a reference to the name/value buffer and * attach it to the new work state. */ attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; args->namelen = nv->name.i_len; args->new_name = nv->new_name.i_addr; args->new_namelen = nv->new_name.i_len; args->value = nv->value.i_addr; args->valuelen = nv->value.i_len; args->new_value = nv->new_value.i_addr; args->new_valuelen = nv->new_value.i_len; args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; args->owner = args->dp->i_ino; xfs_attr_sethash(args); switch (xfs_attr_intent_op(attr)) { case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_REMOVE: attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; } xfs_defer_add_item(dfp, &attr->xattri_list); *ipp = ip; return attr; } /* * Process an attr intent item that was recovered from the log. We need to * delete the attr that it describes. */ STATIC int xfs_attr_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; struct xfs_trans_res resv; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; unsigned int total = 0; /* * First check the validity of the attr described by the ATTRI. If any * are bad, then assume that all are bad and just toss the ATTRI. */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); if (IS_ERR(attr)) return PTR_ERR(attr); args = attr->xattri_da_args; switch (xfs_attr_intent_op(attr)) { case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: resv = xfs_attr_set_resv(args); total = args->total; break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_REMOVE: resv = M_RES(mp)->tr_attrrm; total = XFS_ATTRRM_SPACE_RES(mp); break; } resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; args->trans = tp; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &attrip->attri_format, sizeof(attrip->attri_format)); if (error) goto out_cancel; error = xfs_defer_ops_capture_and_commit(tp, capture_list); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); return error; out_cancel: xfs_trans_cancel(tp); goto out_unlock; } /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * xfs_attr_relog_intent( struct xfs_trans *tp, struct xfs_log_item *intent, struct xfs_log_item *done_item) { struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; struct xfs_attri_log_format *old_attrp; old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; /* * Create a new log item that shares the same name/value buffer as the * old log item. */ new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; new_attrp->alfi_igen = old_attrp->alfi_igen; new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; switch (xfs_attr_log_item_op(old_attrp)) { case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: new_attrp->alfi_new_name_len = old_attrp->alfi_new_name_len; new_attrp->alfi_old_name_len = old_attrp->alfi_old_name_len; break; default: new_attrp->alfi_name_len = old_attrp->alfi_name_len; break; } new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; return &new_attrip->attri_item; } /* Get an ATTRD so we can process all the attrs. */ static struct xfs_log_item * xfs_attr_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { struct xfs_attri_log_item *attrip; struct xfs_attrd_log_item *attrdp; attrip = ATTRI_ITEM(intent); attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, &xfs_attrd_item_ops); attrdp->attrd_attrip = attrip; attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; return &attrdp->attrd_item; } void xfs_attr_defer_add( struct xfs_da_args *args, enum xfs_attr_defer_op op) { struct xfs_attr_intent *new; unsigned int log_op = 0; bool is_pptr = args->attr_filter & XFS_ATTR_PARENT; if (is_pptr) { ASSERT(xfs_has_parent(args->dp->i_mount)); ASSERT((args->attr_filter & ~XFS_ATTR_PARENT) == 0); ASSERT(args->op_flags & XFS_DA_OP_LOGGED); ASSERT(args->valuelen == sizeof(struct xfs_parent_rec)); } new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); new->xattri_da_args = args; /* Compute log operation from the higher level op and namespace. */ switch (op) { case XFS_ATTR_DEFER_SET: if (is_pptr) log_op = XFS_ATTRI_OP_FLAGS_PPTR_SET; else log_op = XFS_ATTRI_OP_FLAGS_SET; break; case XFS_ATTR_DEFER_REPLACE: if (is_pptr) log_op = XFS_ATTRI_OP_FLAGS_PPTR_REPLACE; else log_op = XFS_ATTRI_OP_FLAGS_REPLACE; break; case XFS_ATTR_DEFER_REMOVE: if (is_pptr) log_op = XFS_ATTRI_OP_FLAGS_PPTR_REMOVE; else log_op = XFS_ATTRI_OP_FLAGS_REMOVE; break; default: ASSERT(0); break; } new->xattri_op_flags = log_op; /* Set up initial attr operation state. */ switch (log_op) { case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_SET: new->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: ASSERT(args->new_valuelen == args->valuelen); new->xattri_dela_state = xfs_attr_init_replace_state(args); break; case XFS_ATTRI_OP_FLAGS_REPLACE: new->xattri_dela_state = xfs_attr_init_replace_state(args); break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_REMOVE: new->xattri_dela_state = xfs_attr_init_remove_state(args); break; } xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); } const struct xfs_defer_op_type xfs_attr_defer_type = { .name = "attr", .max_items = 1, .create_intent = xfs_attr_create_intent, .abort_intent = xfs_attr_abort_intent, .create_done = xfs_attr_create_done, .finish_item = xfs_attr_finish_item, .cancel_item = xfs_attr_cancel_item, .recover_work = xfs_attr_recover_work, .relog_intent = xfs_attr_relog_intent, }; static inline void * xfs_attri_validate_name_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, const struct xfs_log_iovec *iovec, unsigned int name_len) { if (iovec->i_len != xlog_calc_iovec_len(name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr, name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, iovec->i_addr, iovec->i_len); return NULL; } return iovec->i_addr; } static inline void * xfs_attri_validate_value_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, const struct xfs_log_iovec *iovec, unsigned int value_len) { if (iovec->i_len != xlog_calc_iovec_len(value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) && !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, iovec->i_addr, iovec->i_len); return NULL; } return iovec->i_addr; } STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_mount *mp = log->l_mp; struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; struct xfs_attri_log_nameval *nv; const void *attr_name; const void *attr_value = NULL; const void *attr_new_name = NULL; const void *attr_new_value = NULL; size_t len; unsigned int name_len = 0; unsigned int value_len = 0; unsigned int new_name_len = 0; unsigned int new_value_len = 0; unsigned int op, i = 0; /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); if (item->ri_buf[i].i_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } attri_formatp = item->ri_buf[i].i_addr; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } /* Check the number of log iovecs makes sense for the op code. */ op = xfs_attr_log_item_op(attri_formatp); switch (op) { case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_PPTR_SET: /* Log item, attr name, attr value */ if (item->ri_total != 3) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } name_len = attri_formatp->alfi_name_len; value_len = attri_formatp->alfi_value_len; break; case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: /* Log item, attr name, attr value */ if (item->ri_total != 3) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } name_len = attri_formatp->alfi_name_len; value_len = attri_formatp->alfi_value_len; break; case XFS_ATTRI_OP_FLAGS_REMOVE: /* Log item, attr name */ if (item->ri_total != 2) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } name_len = attri_formatp->alfi_name_len; break; case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: /* * Log item, attr name, new attr name, attr value, new attr * value */ if (item->ri_total != 5) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } name_len = attri_formatp->alfi_old_name_len; new_name_len = attri_formatp->alfi_new_name_len; new_value_len = value_len = attri_formatp->alfi_value_len; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } i++; /* Validate the attr name */ attr_name = xfs_attri_validate_name_iovec(mp, attri_formatp, &item->ri_buf[i], name_len); if (!attr_name) return -EFSCORRUPTED; i++; /* Validate the new attr name */ if (new_name_len > 0) { attr_new_name = xfs_attri_validate_name_iovec(mp, attri_formatp, &item->ri_buf[i], new_name_len); if (!attr_new_name) return -EFSCORRUPTED; i++; } /* Validate the attr value, if present */ if (value_len != 0) { attr_value = xfs_attri_validate_value_iovec(mp, attri_formatp, &item->ri_buf[i], value_len); if (!attr_value) return -EFSCORRUPTED; i++; } /* Validate the new attr value, if present */ if (new_value_len != 0) { attr_new_value = xfs_attri_validate_value_iovec(mp, attri_formatp, &item->ri_buf[i], new_value_len); if (!attr_new_value) return -EFSCORRUPTED; i++; } /* * Make sure we got the correct number of buffers for the operation * that we just loaded. */ if (i != item->ri_total) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } switch (op) { case XFS_ATTRI_OP_FLAGS_REMOVE: /* Regular remove operations operate only on names. */ if (attr_value != NULL || value_len != 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } fallthrough; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_PPTR_SET: case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: /* * Regular xattr set/remove/replace operations require a name * and do not take a newname. Values are optional for set and * replace. * * Name-value set/remove operations must have a name, do not * take a newname, and can take a value. */ if (attr_name == NULL || name_len == 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } break; case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: /* * Name-value replace operations require the caller to * specify the old and new names and values explicitly. * Values are optional. */ if (attr_name == NULL || name_len == 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } if (attr_new_name == NULL || new_name_len == 0) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); return -EFSCORRUPTED; } break; } /* * Memory alloc failure will cause replay to abort. We attach the * name/value buffer to the recovered incore log item and drop our * reference. */ nv = xfs_attri_log_nameval_alloc(attr_name, name_len, attr_new_name, new_name_len, attr_value, value_len, attr_new_value, new_value_len); attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); xlog_recover_intent_item(log, &attrip->attri_item, lsn, &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } /* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if * it was still in the log. To do this it searches the AIL for the ATTRI with * an id equal to that in the ATTRD format structure. If we find it we drop * the ATTRD reference, which removes the ATTRI from the AIL and frees it. */ STATIC int xlog_recover_attrd_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_attrd_log_format *attrd_formatp; attrd_formatp = item->ri_buf[0].i_addr; if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, item->ri_buf[0].i_addr, item->ri_buf[0].i_len); return -EFSCORRUPTED; } xlog_recover_release_intent(log, XFS_LI_ATTRI, attrd_formatp->alfd_alf_id); return 0; } static const struct xfs_item_ops xfs_attri_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_attri_item_size, .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, .iop_match = xfs_attri_item_match, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { .item_type = XFS_LI_ATTRI, .commit_pass2 = xlog_recover_attri_commit_pass2, }; static const struct xfs_item_ops xfs_attrd_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED | XFS_ITEM_INTENT_DONE, .iop_size = xfs_attrd_item_size, .iop_format = xfs_attrd_item_format, .iop_release = xfs_attrd_item_release, .iop_intent = xfs_attrd_item_intent, }; const struct xlog_recover_item_ops xlog_attrd_item_ops = { .item_type = XFS_LI_ATTRD, .commit_pass2 = xlog_recover_attrd_commit_pass2, }; |
| 1 1 1 313 314 1 291 291 231 230 290 163 231 227 13 225 168 5 174 1 1 1 1 1 1 61 61 62 3 2 62 62 62 62 62 62 30 214 214 213 214 4 209 209 209 22 208 9 9 3 3 7 9 9 9 317 317 318 318 2 280 295 76 22 314 177 290 312 76 314 45 45 45 45 45 12 13 13 13 13 35 35 35 1 34 311 313 314 165 165 163 44 1 45 213 1 294 294 7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2005 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/seq_file.h> #include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_txnmgr.h" #include "jfs_debug.h" #ifdef CONFIG_JFS_STATISTICS static struct { uint pagealloc; /* # of page allocations */ uint pagefree; /* # of page frees */ uint lockwait; /* # of sleeping lock_metapage() calls */ } mpStat; #endif #define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) #define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) static inline void unlock_metapage(struct metapage *mp) { clear_bit_unlock(META_locked, &mp->flag); wake_up(&mp->wait); } static inline void __lock_metapage(struct metapage *mp) { DECLARE_WAITQUEUE(wait, current); INCREMENT(mpStat.lockwait); add_wait_queue_exclusive(&mp->wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); if (metapage_locked(mp)) { folio_unlock(mp->folio); io_schedule(); folio_lock(mp->folio); } } while (trylock_metapage(mp)); __set_current_state(TASK_RUNNING); remove_wait_queue(&mp->wait, &wait); } /* * Must have mp->folio locked */ static inline void lock_metapage(struct metapage *mp) { if (trylock_metapage(mp)) __lock_metapage(mp); } #define METAPOOL_MIN_PAGES 32 static struct kmem_cache *metapage_cache; static mempool_t *metapage_mempool; #define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE) #if MPS_PER_PAGE > 1 struct meta_anchor { int mp_count; atomic_t io_count; blk_status_t status; struct metapage *mp[MPS_PER_PAGE]; }; static inline struct metapage *folio_to_mp(struct folio *folio, int offset) { struct meta_anchor *anchor = folio->private; if (!anchor) return NULL; return anchor->mp[offset >> L2PSIZE]; } static inline int insert_metapage(struct folio *folio, struct metapage *mp) { struct meta_anchor *a; int index; int l2mp_blocks; /* log2 blocks per metapage */ a = folio->private; if (!a) { a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS); if (!a) return -ENOMEM; folio_attach_private(folio, a); kmap(&folio->page); } if (mp) { l2mp_blocks = L2PSIZE - folio->mapping->host->i_blkbits; index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); a->mp_count++; a->mp[index] = mp; } return 0; } static inline void remove_metapage(struct folio *folio, struct metapage *mp) { struct meta_anchor *a = folio->private; int l2mp_blocks = L2PSIZE - folio->mapping->host->i_blkbits; int index; index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); BUG_ON(a->mp[index] != mp); a->mp[index] = NULL; if (--a->mp_count == 0) { kfree(a); folio_detach_private(folio); kunmap(&folio->page); } } static inline void inc_io(struct folio *folio) { struct meta_anchor *anchor = folio->private; atomic_inc(&anchor->io_count); } static inline void dec_io(struct folio *folio, blk_status_t status, void (*handler)(struct folio *, blk_status_t)) { struct meta_anchor *anchor = folio->private; if (anchor->status == BLK_STS_OK) anchor->status = status; if (atomic_dec_and_test(&anchor->io_count)) handler(folio, anchor->status); } #else static inline struct metapage *folio_to_mp(struct folio *folio, int offset) { return folio->private; } static inline int insert_metapage(struct folio *folio, struct metapage *mp) { if (mp) { folio_attach_private(folio, mp); kmap(&folio->page); } return 0; } static inline void remove_metapage(struct folio *folio, struct metapage *mp) { folio_detach_private(folio); kunmap(&folio->page); } #define inc_io(folio) do {} while(0) #define dec_io(folio, status, handler) handler(folio, status) #endif static inline struct metapage *alloc_metapage(gfp_t gfp_mask) { struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask); if (mp) { mp->lid = 0; mp->lsn = 0; mp->data = NULL; mp->clsn = 0; mp->log = NULL; init_waitqueue_head(&mp->wait); } return mp; } static inline void free_metapage(struct metapage *mp) { mempool_free(mp, metapage_mempool); } int __init metapage_init(void) { /* * Allocate the metapage structures */ metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), 0, 0, NULL); if (metapage_cache == NULL) return -ENOMEM; metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, metapage_cache); if (metapage_mempool == NULL) { kmem_cache_destroy(metapage_cache); return -ENOMEM; } return 0; } void metapage_exit(void) { mempool_destroy(metapage_mempool); kmem_cache_destroy(metapage_cache); } static inline void drop_metapage(struct folio *folio, struct metapage *mp) { if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) || test_bit(META_io, &mp->flag)) return; remove_metapage(folio, mp); INCREMENT(mpStat.pagefree); free_metapage(mp); } /* * Metapage address space operations */ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, int *len) { int rc = 0; int xflag; s64 xaddr; sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_blkbits; if (lblock >= file_blocks) return 0; if (lblock + *len > file_blocks) *len = file_blocks - lblock; if (inode->i_ino) { rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0); if ((rc == 0) && *len) lblock = (sector_t)xaddr; else lblock = 0; } /* else no mapping */ return lblock; } static void last_read_complete(struct folio *folio, blk_status_t status) { if (status) printk(KERN_ERR "Read error %d at %#llx\n", status, folio_pos(folio)); folio_end_read(folio, status == 0); } static void metapage_read_end_io(struct bio *bio) { struct folio *folio = bio->bi_private; dec_io(folio, bio->bi_status, last_read_complete); bio_put(bio); } static void remove_from_logsync(struct metapage *mp) { struct jfs_log *log = mp->log; unsigned long flags; /* * This can race. Recheck that log hasn't been set to null, and after * acquiring logsync lock, recheck lsn */ if (!log) return; LOGSYNC_LOCK(log, flags); if (mp->lsn) { mp->log = NULL; mp->lsn = 0; mp->clsn = 0; log->count--; list_del(&mp->synclist); } LOGSYNC_UNLOCK(log, flags); } static void last_write_complete(struct folio *folio, blk_status_t status) { struct metapage *mp; unsigned int offset; if (status) { int err = blk_status_to_errno(status); printk(KERN_ERR "metapage_write_end_io: I/O error\n"); mapping_set_error(folio->mapping, err); } for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = folio_to_mp(folio, offset); if (mp && test_bit(META_io, &mp->flag)) { if (mp->lsn) remove_from_logsync(mp); clear_bit(META_io, &mp->flag); } /* * I'd like to call drop_metapage here, but I don't think it's * safe unless I have the page locked */ } folio_end_writeback(folio); } static void metapage_write_end_io(struct bio *bio) { struct folio *folio = bio->bi_private; BUG_ON(!folio->private); dec_io(folio, bio->bi_status, last_write_complete); bio_put(bio); } static int metapage_write_folio(struct folio *folio, struct writeback_control *wbc, void *unused) { struct bio *bio = NULL; int block_offset; /* block offset of mp within page */ struct inode *inode = folio->mapping->host; int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; int len; int xlen; struct metapage *mp; int redirty = 0; sector_t lblock; int nr_underway = 0; sector_t pblock; sector_t next_block = 0; sector_t page_start; unsigned long bio_bytes = 0; unsigned long bio_offset = 0; int offset; int bad_blocks = 0; page_start = folio_pos(folio) >> inode->i_blkbits; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = folio_to_mp(folio, offset); if (!mp || !test_bit(META_dirty, &mp->flag)) continue; if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) { redirty = 1; /* * Make sure this page isn't blocked indefinitely. * If the journal isn't undergoing I/O, push it */ if (mp->log && !(mp->log->cflag & logGC_PAGEOUT)) jfs_flush_journal(mp->log, 0); continue; } clear_bit(META_dirty, &mp->flag); set_bit(META_io, &mp->flag); block_offset = offset >> inode->i_blkbits; lblock = page_start + block_offset; if (bio) { if (xlen && lblock == next_block) { /* Contiguous, in memory & on disk */ len = min(xlen, blocks_per_mp); xlen -= len; bio_bytes += len << inode->i_blkbits; continue; } /* Not contiguous */ bio_add_folio_nofail(bio, folio, bio_bytes, bio_offset); /* * Increment counter before submitting i/o to keep * count from hitting zero before we're through */ inc_io(folio); if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(bio); nr_underway++; bio = NULL; } else inc_io(folio); xlen = (folio_size(folio) - offset) >> inode->i_blkbits; pblock = metapage_get_blocks(inode, lblock, &xlen); if (!pblock) { printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); /* * We already called inc_io(), but can't cancel it * with dec_io() until we're done with the page */ bad_blocks++; continue; } len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_WRITE, GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = folio; /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; bio_bytes = len << inode->i_blkbits; xlen -= len; next_block = lblock + len; } if (bio) { bio_add_folio_nofail(bio, folio, bio_bytes, bio_offset); if (!bio->bi_iter.bi_size) goto dump_bio; submit_bio(bio); nr_underway++; } if (redirty) folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); if (bad_blocks) goto err_out; if (nr_underway == 0) folio_end_writeback(folio); return 0; dump_bio: print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, 4, bio, sizeof(*bio), 0); bio_put(bio); folio_unlock(folio); dec_io(folio, BLK_STS_OK, last_write_complete); err_out: while (bad_blocks--) dec_io(folio, BLK_STS_OK, last_write_complete); return -EIO; } static int metapage_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct blk_plug plug; int err; blk_start_plug(&plug); err = write_cache_pages(mapping, wbc, metapage_write_folio, NULL); blk_finish_plug(&plug); return err; } static int metapage_read_folio(struct file *fp, struct folio *folio) { struct inode *inode = folio->mapping->host; struct bio *bio = NULL; int block_offset; int blocks_per_page = i_blocks_per_folio(inode, folio); sector_t page_start; /* address of page in fs blocks */ sector_t pblock; int xlen; unsigned int len; int offset; BUG_ON(!folio_test_locked(folio)); page_start = folio_pos(folio) >> inode->i_blkbits; block_offset = 0; while (block_offset < blocks_per_page) { xlen = blocks_per_page - block_offset; pblock = metapage_get_blocks(inode, page_start + block_offset, &xlen); if (pblock) { if (!folio->private) insert_metapage(folio, NULL); inc_io(folio); if (bio) submit_bio(bio); bio = bio_alloc(inode->i_sb->s_bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = folio; len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; bio_add_folio_nofail(bio, folio, len, offset); block_offset += xlen; } else block_offset++; } if (bio) submit_bio(bio); else folio_unlock(folio); return 0; } static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask) { struct metapage *mp; bool ret = true; int offset; for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = folio_to_mp(folio, offset); if (!mp) continue; jfs_info("metapage_release_folio: mp = 0x%p", mp); if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag)) { jfs_info("count = %ld, nohomeok = %d", mp->count, mp->nohomeok); ret = false; continue; } if (mp->lsn) remove_from_logsync(mp); remove_metapage(folio, mp); INCREMENT(mpStat.pagefree); free_metapage(mp); } return ret; } static void metapage_invalidate_folio(struct folio *folio, size_t offset, size_t length) { BUG_ON(offset || length < folio_size(folio)); BUG_ON(folio_test_writeback(folio)); metapage_release_folio(folio, 0); } const struct address_space_operations jfs_metapage_aops = { .read_folio = metapage_read_folio, .writepages = metapage_writepages, .release_folio = metapage_release_folio, .invalidate_folio = metapage_invalidate_folio, .dirty_folio = filemap_dirty_folio, }; struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, unsigned int size, int absolute, unsigned long new) { int l2BlocksPerPage; int l2bsize; struct address_space *mapping; struct metapage *mp = NULL; struct folio *folio; unsigned long page_index; unsigned long page_offset; jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", inode->i_ino, lblock, absolute); l2bsize = inode->i_blkbits; l2BlocksPerPage = PAGE_SHIFT - l2bsize; page_index = lblock >> l2BlocksPerPage; page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; if ((page_offset + size) > PAGE_SIZE) { jfs_err("MetaData crosses page boundary!!"); jfs_err("lblock = %lx, size = %d", lblock, size); dump_stack(); return NULL; } if (absolute) mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping; else { /* * If an nfs client tries to read an inode that is larger * than any existing inodes, we may try to read past the * end of the inode map */ if ((lblock << inode->i_blkbits) >= inode->i_size) return NULL; mapping = inode->i_mapping; } if (new && (PSIZE == PAGE_SIZE)) { folio = filemap_grab_folio(mapping, page_index); if (IS_ERR(folio)) { jfs_err("filemap_grab_folio failed!"); return NULL; } folio_mark_uptodate(folio); } else { folio = read_mapping_folio(mapping, page_index, NULL); if (IS_ERR(folio)) { jfs_err("read_mapping_page failed!"); return NULL; } folio_lock(folio); } mp = folio_to_mp(folio, page_offset); if (mp) { if (mp->logical_size != size) { jfs_error(inode->i_sb, "get_mp->logical_size != size\n"); jfs_err("logical_size = %d, size = %d", mp->logical_size, size); dump_stack(); goto unlock; } mp->count++; lock_metapage(mp); if (test_bit(META_discard, &mp->flag)) { if (!new) { jfs_error(inode->i_sb, "using a discarded metapage\n"); discard_metapage(mp); goto unlock; } clear_bit(META_discard, &mp->flag); } } else { INCREMENT(mpStat.pagealloc); mp = alloc_metapage(GFP_NOFS); if (!mp) goto unlock; mp->folio = folio; mp->sb = inode->i_sb; mp->flag = 0; mp->xflag = COMMIT_PAGE; mp->count = 1; mp->nohomeok = 0; mp->logical_size = size; mp->data = folio_address(folio) + page_offset; mp->index = lblock; if (unlikely(insert_metapage(folio, mp))) { free_metapage(mp); goto unlock; } lock_metapage(mp); } if (new) { jfs_info("zeroing mp = 0x%p", mp); memset(mp->data, 0, PSIZE); } folio_unlock(folio); jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data); return mp; unlock: folio_unlock(folio); return NULL; } void grab_metapage(struct metapage * mp) { jfs_info("grab_metapage: mp = 0x%p", mp); folio_get(mp->folio); folio_lock(mp->folio); mp->count++; lock_metapage(mp); folio_unlock(mp->folio); } static int metapage_write_one(struct folio *folio) { struct address_space *mapping = folio->mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = folio_nr_pages(folio), }; int ret = 0; BUG_ON(!folio_test_locked(folio)); folio_wait_writeback(folio); if (folio_clear_dirty_for_io(folio)) { folio_get(folio); ret = metapage_write_folio(folio, &wbc, NULL); if (ret == 0) folio_wait_writeback(folio); folio_put(folio); } else { folio_unlock(folio); } if (!ret) ret = filemap_check_errors(mapping); return ret; } void force_metapage(struct metapage *mp) { struct folio *folio = mp->folio; jfs_info("force_metapage: mp = 0x%p", mp); set_bit(META_forcewrite, &mp->flag); clear_bit(META_sync, &mp->flag); folio_get(folio); folio_lock(folio); folio_mark_dirty(folio); if (metapage_write_one(folio)) jfs_error(mp->sb, "metapage_write_one() failed\n"); clear_bit(META_forcewrite, &mp->flag); folio_put(folio); } void hold_metapage(struct metapage *mp) { folio_lock(mp->folio); } void put_metapage(struct metapage *mp) { if (mp->count || mp->nohomeok) { /* Someone else will release this */ folio_unlock(mp->folio); return; } folio_get(mp->folio); mp->count++; lock_metapage(mp); folio_unlock(mp->folio); release_metapage(mp); } void release_metapage(struct metapage * mp) { struct folio *folio = mp->folio; jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag); folio_lock(folio); unlock_metapage(mp); assert(mp->count); if (--mp->count || mp->nohomeok) { folio_unlock(folio); folio_put(folio); return; } if (test_bit(META_dirty, &mp->flag)) { folio_mark_dirty(folio); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); if (metapage_write_one(folio)) jfs_error(mp->sb, "metapage_write_one() failed\n"); folio_lock(folio); } } else if (mp->lsn) /* discard_metapage doesn't remove it */ remove_from_logsync(mp); /* Try to keep metapages from using up too much memory */ drop_metapage(folio, mp); folio_unlock(folio); folio_put(folio); } void __invalidate_metapages(struct inode *ip, s64 addr, int len) { sector_t lblock; int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits; int BlocksPerPage = 1 << l2BlocksPerPage; /* All callers are interested in block device's mapping */ struct address_space *mapping = JFS_SBI(ip->i_sb)->direct_inode->i_mapping; struct metapage *mp; unsigned int offset; /* * Mark metapages to discard. They will eventually be * released, but should not be written. */ for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len; lblock += BlocksPerPage) { struct folio *folio = filemap_lock_folio(mapping, lblock >> l2BlocksPerPage); if (IS_ERR(folio)) continue; for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { mp = folio_to_mp(folio, offset); if (!mp) continue; if (mp->index < addr) continue; if (mp->index >= addr + len) break; clear_bit(META_dirty, &mp->flag); set_bit(META_discard, &mp->flag); if (mp->lsn) remove_from_logsync(mp); } folio_unlock(folio); folio_put(folio); } } #ifdef CONFIG_JFS_STATISTICS int jfs_mpstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Metapage statistics\n" "=======================\n" "page allocations = %d\n" "page frees = %d\n" "lock waits = %d\n", mpStat.pagealloc, mpStat.pagefree, mpStat.lockwait); return 0; } #endif |
| 27 24 24 24 24 24 24 24 24 24 24 24 24 25 25 25 1 24 24 24 24 24 24 24 24 24 24 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" STATIC void xlog_recover_inode_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } else { struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } } /* * Inode fork owner changes * * If we have been told that we have to reparent the inode fork, it's because an * extent swap operation on a CRC enabled filesystem has been done and we are * replaying it. We need to walk the BMBT of the appropriate fork and change the * owners of it. * * The complexity here is that we don't have an inode context to work with, so * after we've replayed the inode we need to instantiate one. This is where the * fun begins. * * We are in the middle of log recovery, so we can't run transactions. That * means we cannot use cache coherent inode instantiation via xfs_iget(), as * that will result in the corresponding iput() running the inode through * xfs_inactive(). If we've just replayed an inode core that changes the link * count to zero (i.e. it's been unlinked), then xfs_inactive() will run * transactions (bad!). * * So, to avoid this, we instantiate an inode directly from the inode core we've * just recovered. We have the buffer still locked, and all we really need to * instantiate is the inode core and the forks being modified. We can do this * manually, then run the inode btree owner change, and then tear down the * xfs_inode without having to run any transactions at all. * * Also, because we don't have a transaction context available here but need to * gather all the buffers we modify for writeback so we pass the buffer_list * instead for the operation to use. */ STATIC int xfs_recover_inode_owner_change( struct xfs_mount *mp, struct xfs_dinode *dip, struct xfs_inode_log_format *in_f, struct list_head *buffer_list) { struct xfs_inode *ip; int error; ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); ip = xfs_inode_alloc(mp, in_f->ilf_ino); if (!ip) return -ENOMEM; /* instantiate the inode */ ASSERT(dip->di_version >= 3); error = xfs_inode_from_disk(ip, dip); if (error) goto out_free_ip; if (in_f->ilf_fields & XFS_ILOG_DOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, ip->i_ino, buffer_list); if (error) goto out_free_ip; } if (in_f->ilf_fields & XFS_ILOG_AOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, ip->i_ino, buffer_list); if (error) goto out_free_ip; } out_free_ip: xfs_inode_free(ip); return error; } static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) { return ld->di_version >= 3 && (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); } /* Convert a log timestamp to an ondisk timestamp. */ static inline xfs_timestamp_t xfs_log_dinode_to_disk_ts( struct xfs_log_dinode *from, const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; struct xfs_log_legacy_timestamp *lits; xfs_timestamp_t ts; if (xfs_log_dinode_has_bigtime(from)) return cpu_to_be64(its); lts = (struct xfs_legacy_timestamp *)&ts; lits = (struct xfs_log_legacy_timestamp *)&its; lts->t_sec = cpu_to_be32(lits->t_sec); lts->t_nsec = cpu_to_be32(lits->t_nsec); return ts; } static inline bool xfs_log_dinode_has_large_ex |