| 28 431 19 43 431 431 29 432 66 65 16 66 66 13 13 1 383 203 379 364 66 65 364 66 362 132 1 131 3 127 131 131 362 41 384 6 66 238 237 238 94 88 154 238 8 273 272 183 125 273 38 268 18 273 37 269 109 354 63 356 356 132 2 132 355 273 86 8 86 61 86 356 80 392 187 390 28 28 2 2 390 391 392 392 133 133 7 391 39 385 392 5 392 392 37 386 392 32 73 32 29 390 32 32 30 369 29 4 3 4 391 392 390 392 67 354 354 392 392 391 1 385 378 10 382 358 356 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/attr.c * * Copyright (C) 1991, 1992 Linus Torvalds * changes by Thomas Schoebel-Theuer */ #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/filelock.h> #include <linux/security.h> /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. * We retain backwards compatibility and require setgid bit to be removed * unconditionally if S_IXGRP is set. Otherwise we have the exact same * requirements as setattr_prepare() and setattr_copy(). * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. */ int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; if (!(mode & S_ISGID)) return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both * set{g,u}id bits need to be removed the corresponding mask of both flags is * returned. * * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. */ int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * chgrp_ok - verify permissions to chgrp inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * setattr_prepare - check if attribute changes to a dentry are allowed * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr * in the given dentry. This includes the normal unix access permission * checks, as well as checks for rlimits and others. The function also clears * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* * First check size constraints. These can't be overriden using * ATTR_FORCE. */ if (ia_valid & ATTR_SIZE) { int error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; } /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } kill_priv: /* User has permission for the change */ if (ia_valid & ATTR_KILL_PRIV) { int error; error = security_inode_killpriv(idmap, dentry); if (error) return error; } return 0; } EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size * @inode: the inode to be truncated * @offset: the new size to assign to the inode * * inode_newsize_ok must be called with i_rwsem held exclusively. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). * * Return: 0 on success, -ve errno on failure */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { if (offset < 0) return -EINVAL; if (inode->i_size < offset) { unsigned long limit; limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) goto out_big; } else { /* * truncation of in-use swapfiles is disallowed - it would * cause subsequent swapout to scribble on the now-freed * blocks. */ if (IS_SWAPFILE(inode)) return -ETXTBSY; } return 0; out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; } EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy_mgtime - update timestamps for mgtime inodes * @inode: inode timestamps to be updated * @attr: attrs for the update * * With multigrain timestamps, take more care to prevent races when * updating the ctime. Always update the ctime to the very latest using * the standard mechanism, and use that to populate the atime and mtime * appropriately (unless those are being set to specific values). */ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; struct timespec64 now; if (ia_valid & ATTR_CTIME_SET) now = inode_set_ctime_deleg(inode, attr->ia_ctime); else if (ia_valid & ATTR_CTIME) now = inode_set_ctime_current(inode); else now = current_time(inode); if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, now); if (ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, now); } /** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_rwsem held exclusively. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } if (is_mgtime(inode)) return setattr_copy_mgtime(inode, attr); if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME_SET) inode_set_ctime_deleg(inode, attr->ia_ctime); else if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } /* * If utimes(2) and friends are called with times == NULL (or both * times are UTIME_NOW), then we need to check for write permission */ if (ia_valid & ATTR_TOUCH) { if (IS_IMMUTABLE(inode)) return -EPERM; if (!inode_owner_or_capable(idmap, inode)) { error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } } return 0; } EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * * The caller must hold the i_rwsem exclusively on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); error = may_setattr(idmap, inode, ia_valid); if (error) return error; if ((ia_valid & ATTR_MODE)) { /* * Don't allow changing the mode of symlinks: * * (1) The vfs doesn't take the mode of symlinks into account * during permission checking. * (2) This has never worked correctly. Most major filesystems * did return EOPNOTSUPP due to interactions with POSIX ACLs * but did still updated the mode of the symlink. * This inconsistency led system call wrapper providers such * as libc to block changing the mode of symlinks with * EOPNOTSUPP already. * (3) To even do this in the first place one would have to use * specific file descriptors and quite some effort. */ if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; /* Flag setting protected by i_rwsem */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } now = current_time(inode); if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); else attr->ia_atime = now; if (ia_valid & ATTR_CTIME_SET) attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); else attr->ia_ctime = now; if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); else attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) return error; if (error == 0) ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* * We now pass ATTR_KILL_S*ID to the lower level setattr function so * that the function has the ability to reinterpret a mode change * that's due to these bits. This adds an implicit restriction that * no function will ever call notify_change with both ATTR_MODE and * ATTR_KILL_S*ID set. */ if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && (ia_valid & ATTR_MODE)) BUG(); if (ia_valid & ATTR_KILL_SUID) { if (mode & S_ISUID) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = (inode->i_mode & ~S_ISUID); } } if (ia_valid & ATTR_KILL_SGID) { if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; } attr->ia_mode &= ~S_ISGID; } } if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; /* * Verify that uid/gid changes are valid in the target * namespace of the superblock. */ if (ia_valid & ATTR_UID && !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; error = security_inode_setattr(idmap, dentry, attr); if (error) return error; /* * If ATTR_DELEG is set, then these attributes are being set on * behalf of the holder of a write delegation. We want to avoid * breaking the delegation in this case. */ if (!(ia_valid & ATTR_DELEG)) { error = try_break_deleg(inode, delegated_inode); if (error) return error; } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); else error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); security_inode_post_setattr(idmap, dentry, ia_valid); } return error; } EXPORT_SYMBOL(notify_change); |
| 236 236 234 236 157 2 157 157 157 1 1 1 1 1 1 1 1 75 75 75 75 75 75 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * tcp_timeouts table has copy per netns in a hash table per * protocol ip_vs_proto_data and is handled by netns */ #define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; __be16 _ports[2], *ports = NULL; /* In the event of icmp, we're only guaranteed to have the first 8 * bytes of the transport header, so we only check the rest of the * TCP packet for non-ICMP packets */ if (likely(!ip_vs_iph_icmp(iph))) { th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th) { if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) return 1; ports = &th->source; } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void tcp_fast_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); else #endif tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } static inline void tcp_partial_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); else #endif tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); } INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); } return 1; } static int tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int tcphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcphoff = sizeof(struct ipv6hdr); else #endif tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } return 1; } #define TCP_DIR_INPUT 0 #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, }; /* * Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, [IP_VS_TCP_S_CLOSE] = 10*HZ, [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, [IP_VS_TCP_S_LAST_ACK] = 30*HZ, [IP_VS_TCP_S_LISTEN] = 2*60*HZ, [IP_VS_TCP_S_SYNACK] = 120*HZ, [IP_VS_TCP_S_LAST] = 2*HZ, }; static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", [IP_VS_TCP_S_CLOSE] = "CLOSE", [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", [IP_VS_TCP_S_LISTEN] = "LISTEN", [IP_VS_TCP_S_SYNACK] = "SYNACK", [IP_VS_TCP_S_LAST] = "BUG!", }; static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { [IP_VS_TCP_S_NONE] = false, [IP_VS_TCP_S_ESTABLISHED] = true, [IP_VS_TCP_S_SYN_SENT] = true, [IP_VS_TCP_S_SYN_RECV] = true, [IP_VS_TCP_S_FIN_WAIT] = false, [IP_VS_TCP_S_TIME_WAIT] = false, [IP_VS_TCP_S_CLOSE] = false, [IP_VS_TCP_S_CLOSE_WAIT] = false, [IP_VS_TCP_S_LAST_ACK] = false, [IP_VS_TCP_S_LISTEN] = false, [IP_VS_TCP_S_SYNACK] = true, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACK struct tcp_states_t { int next_state[IP_VS_TCP_S_LAST]; }; static const char * tcp_state_name(int state) { if (state >= IP_VS_TCP_S_LAST) return "ERR!"; return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } static bool tcp_state_active(int state) { if (state >= IP_VS_TCP_S_LAST) return false; return tcp_state_active_table[state]; } static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ /* ** FIXME: change secure_tcp to independent sysctl var ** or make it per-service or per-app because it is valid ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) return 3; if (th->syn) return 0; if (th->fin) return 1; if (th->ack) return 2; return -1; } static inline void set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; /* * Update state offset to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (state_off == TCP_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else state_off = TCP_DIR_INPUT_ONLY; } if ((state_idx = tcp_state_idx(th)) < 0) { IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); goto tcp_state_out; } new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (new_state == IP_VS_TCP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ cp->timeout = tcp_timeouts[cp->state = new_state]; } /* * Handle state transitions */ static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; #ifdef CONFIG_IP_VS_IPV6 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else int ihl = ip_hdrlen(skb); #endif th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return; spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) & TCP_APP_TAB_MASK; } static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; pd->tcp_state_table = tcp_states; return 0; } static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, }; |
| 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 6 6 7 7 7 7 7 7 7 6 3 3 4 4 6 7 7 7 7 7 7 7 7 7 7 7 5 7 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 | // SPDX-License-Identifier: GPL-2.0+ /* Keyspan USB to Serial Converter driver (C) Copyright (C) 2000-2001 Hugh Blemings <hugh@blemings.org> (C) Copyright (C) 2002 Greg Kroah-Hartman <greg@kroah.com> See http://blemings.org/hugh/keyspan.html for more information. Code in this driver inspired by and in a number of places taken from Brian Warner's original Keyspan-PDA driver. This driver has been put together with the support of Innosys, Inc. and Keyspan, Inc the manufacturers of the Keyspan USB-serial products. Thanks Guys :) Thanks to Paulus for miscellaneous tidy ups, some largish chunks of much nicer and/or completely new code and (perhaps most uniquely) having the patience to sit down and explain why and where he'd changed stuff. Tip 'o the hat to IBM (and previously Linuxcare :) for supporting staff in their work on open source projects. */ #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/usb/ezusb.h> #define DRIVER_AUTHOR "Hugh Blemings <hugh@misc.nu" #define DRIVER_DESC "Keyspan USB to Serial Converter Driver" static void keyspan_send_setup(struct usb_serial_port *port, int reset_port); static int keyspan_usa19_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum); static int keyspan_usa19w_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum); static int keyspan_usa28_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum); static int keyspan_usa19hs_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum); static int keyspan_usa28_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port); static int keyspan_usa26_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port); static int keyspan_usa49_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port); static int keyspan_usa90_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port); static int keyspan_usa67_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port); /* Values used for baud rate calculation - device specific */ #define KEYSPAN_INVALID_BAUD_RATE (-1) #define KEYSPAN_BAUD_RATE_OK (0) #define KEYSPAN_USA18X_BAUDCLK (12000000L) /* a guess */ #define KEYSPAN_USA19_BAUDCLK (12000000L) #define KEYSPAN_USA19W_BAUDCLK (24000000L) #define KEYSPAN_USA19HS_BAUDCLK (14769231L) #define KEYSPAN_USA28_BAUDCLK (1843200L) #define KEYSPAN_USA28X_BAUDCLK (12000000L) #define KEYSPAN_USA49W_BAUDCLK (48000000L) /* Some constants used to characterise each device. */ #define KEYSPAN_MAX_NUM_PORTS (4) #define KEYSPAN_MAX_FLIPS (2) /* * Device info for the Keyspan serial converter, used by the overall * usb-serial probe function. */ #define KEYSPAN_VENDOR_ID (0x06cd) /* Product IDs for the products supported, pre-renumeration */ #define keyspan_usa18x_pre_product_id 0x0105 #define keyspan_usa19_pre_product_id 0x0103 #define keyspan_usa19qi_pre_product_id 0x010b #define keyspan_mpr_pre_product_id 0x011b #define keyspan_usa19qw_pre_product_id 0x0118 #define keyspan_usa19w_pre_product_id 0x0106 #define keyspan_usa28_pre_product_id 0x0101 #define keyspan_usa28x_pre_product_id 0x0102 #define keyspan_usa28xa_pre_product_id 0x0114 #define keyspan_usa28xb_pre_product_id 0x0113 #define keyspan_usa49w_pre_product_id 0x0109 #define keyspan_usa49wlc_pre_product_id 0x011a /* * Product IDs post-renumeration. Note that the 28x and 28xb have the same * id's post-renumeration but behave identically so it's not an issue. As * such, the 28xb is not listed in any of the device tables. */ #define keyspan_usa18x_product_id 0x0112 #define keyspan_usa19_product_id 0x0107 #define keyspan_usa19qi_product_id 0x010c #define keyspan_usa19hs_product_id 0x0121 #define keyspan_mpr_product_id 0x011c #define keyspan_usa19qw_product_id 0x0119 #define keyspan_usa19w_product_id 0x0108 #define keyspan_usa28_product_id 0x010f #define keyspan_usa28x_product_id 0x0110 #define keyspan_usa28xa_product_id 0x0115 #define keyspan_usa28xb_product_id 0x0110 #define keyspan_usa28xg_product_id 0x0135 #define keyspan_usa49w_product_id 0x010a #define keyspan_usa49wlc_product_id 0x012a #define keyspan_usa49wg_product_id 0x0131 struct keyspan_device_details { /* product ID value */ int product_id; enum {msg_usa26, msg_usa28, msg_usa49, msg_usa90, msg_usa67} msg_format; /* Number of physical ports */ int num_ports; /* 1 if endpoint flipping used on input, 0 if not */ int indat_endp_flip; /* 1 if endpoint flipping used on output, 0 if not */ int outdat_endp_flip; /* * Table mapping input data endpoint IDs to physical port * number and flip if used */ int indat_endpoints[KEYSPAN_MAX_NUM_PORTS]; /* Same for output endpoints */ int outdat_endpoints[KEYSPAN_MAX_NUM_PORTS]; /* Input acknowledge endpoints */ int inack_endpoints[KEYSPAN_MAX_NUM_PORTS]; /* Output control endpoints */ int outcont_endpoints[KEYSPAN_MAX_NUM_PORTS]; /* Endpoint used for input status */ int instat_endpoint; /* Endpoint used for input data 49WG only */ int indat_endpoint; /* Endpoint used for global control functions */ int glocont_endpoint; int (*calculate_baud_rate)(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum); u32 baudclk; }; /* * Now for each device type we setup the device detail structure with the * appropriate information (provided in Keyspan's documentation) */ static const struct keyspan_device_details usa18x_device_details = { .product_id = keyspan_usa18x_product_id, .msg_format = msg_usa26, .num_ports = 1, .indat_endp_flip = 0, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x85}, .outcont_endpoints = {0x05}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA18X_BAUDCLK, }; static const struct keyspan_device_details usa19_device_details = { .product_id = keyspan_usa19_product_id, .msg_format = msg_usa28, .num_ports = 1, .indat_endp_flip = 1, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x83}, .outcont_endpoints = {0x03}, .instat_endpoint = 0x84, .indat_endpoint = -1, .glocont_endpoint = -1, .calculate_baud_rate = keyspan_usa19_calc_baud, .baudclk = KEYSPAN_USA19_BAUDCLK, }; static const struct keyspan_device_details usa19qi_device_details = { .product_id = keyspan_usa19qi_product_id, .msg_format = msg_usa28, .num_ports = 1, .indat_endp_flip = 1, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x83}, .outcont_endpoints = {0x03}, .instat_endpoint = 0x84, .indat_endpoint = -1, .glocont_endpoint = -1, .calculate_baud_rate = keyspan_usa28_calc_baud, .baudclk = KEYSPAN_USA19_BAUDCLK, }; static const struct keyspan_device_details mpr_device_details = { .product_id = keyspan_mpr_product_id, .msg_format = msg_usa28, .num_ports = 1, .indat_endp_flip = 1, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x83}, .outcont_endpoints = {0x03}, .instat_endpoint = 0x84, .indat_endpoint = -1, .glocont_endpoint = -1, .calculate_baud_rate = keyspan_usa28_calc_baud, .baudclk = KEYSPAN_USA19_BAUDCLK, }; static const struct keyspan_device_details usa19qw_device_details = { .product_id = keyspan_usa19qw_product_id, .msg_format = msg_usa26, .num_ports = 1, .indat_endp_flip = 0, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x85}, .outcont_endpoints = {0x05}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA19W_BAUDCLK, }; static const struct keyspan_device_details usa19w_device_details = { .product_id = keyspan_usa19w_product_id, .msg_format = msg_usa26, .num_ports = 1, .indat_endp_flip = 0, .outdat_endp_flip = 1, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {0x85}, .outcont_endpoints = {0x05}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA19W_BAUDCLK, }; static const struct keyspan_device_details usa19hs_device_details = { .product_id = keyspan_usa19hs_product_id, .msg_format = msg_usa90, .num_ports = 1, .indat_endp_flip = 0, .outdat_endp_flip = 0, .indat_endpoints = {0x81}, .outdat_endpoints = {0x01}, .inack_endpoints = {-1}, .outcont_endpoints = {0x02}, .instat_endpoint = 0x82, .indat_endpoint = -1, .glocont_endpoint = -1, .calculate_baud_rate = keyspan_usa19hs_calc_baud, .baudclk = KEYSPAN_USA19HS_BAUDCLK, }; static const struct keyspan_device_details usa28_device_details = { .product_id = keyspan_usa28_product_id, .msg_format = msg_usa28, .num_ports = 2, .indat_endp_flip = 1, .outdat_endp_flip = 1, .indat_endpoints = {0x81, 0x83}, .outdat_endpoints = {0x01, 0x03}, .inack_endpoints = {0x85, 0x86}, .outcont_endpoints = {0x05, 0x06}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa28_calc_baud, .baudclk = KEYSPAN_USA28_BAUDCLK, }; static const struct keyspan_device_details usa28x_device_details = { .product_id = keyspan_usa28x_product_id, .msg_format = msg_usa26, .num_ports = 2, .indat_endp_flip = 0, .outdat_endp_flip = 1, .indat_endpoints = {0x81, 0x83}, .outdat_endpoints = {0x01, 0x03}, .inack_endpoints = {0x85, 0x86}, .outcont_endpoints = {0x05, 0x06}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA28X_BAUDCLK, }; static const struct keyspan_device_details usa28xa_device_details = { .product_id = keyspan_usa28xa_product_id, .msg_format = msg_usa26, .num_ports = 2, .indat_endp_flip = 0, .outdat_endp_flip = 1, .indat_endpoints = {0x81, 0x83}, .outdat_endpoints = {0x01, 0x03}, .inack_endpoints = {0x85, 0x86}, .outcont_endpoints = {0x05, 0x06}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA28X_BAUDCLK, }; static const struct keyspan_device_details usa28xg_device_details = { .product_id = keyspan_usa28xg_product_id, .msg_format = msg_usa67, .num_ports = 2, .indat_endp_flip = 0, .outdat_endp_flip = 0, .indat_endpoints = {0x84, 0x88}, .outdat_endpoints = {0x02, 0x06}, .inack_endpoints = {-1, -1}, .outcont_endpoints = {-1, -1}, .instat_endpoint = 0x81, .indat_endpoint = -1, .glocont_endpoint = 0x01, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA28X_BAUDCLK, }; /* * We don't need a separate entry for the usa28xb as it appears as a 28x * anyway. */ static const struct keyspan_device_details usa49w_device_details = { .product_id = keyspan_usa49w_product_id, .msg_format = msg_usa49, .num_ports = 4, .indat_endp_flip = 0, .outdat_endp_flip = 0, .indat_endpoints = {0x81, 0x82, 0x83, 0x84}, .outdat_endpoints = {0x01, 0x02, 0x03, 0x04}, .inack_endpoints = {-1, -1, -1, -1}, .outcont_endpoints = {-1, -1, -1, -1}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA49W_BAUDCLK, }; static const struct keyspan_device_details usa49wlc_device_details = { .product_id = keyspan_usa49wlc_product_id, .msg_format = msg_usa49, .num_ports = 4, .indat_endp_flip = 0, .outdat_endp_flip = 0, .indat_endpoints = {0x81, 0x82, 0x83, 0x84}, .outdat_endpoints = {0x01, 0x02, 0x03, 0x04}, .inack_endpoints = {-1, -1, -1, -1}, .outcont_endpoints = {-1, -1, -1, -1}, .instat_endpoint = 0x87, .indat_endpoint = -1, .glocont_endpoint = 0x07, .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA19W_BAUDCLK, }; static const struct keyspan_device_details usa49wg_device_details = { .product_id = keyspan_usa49wg_product_id, .msg_format = msg_usa49, .num_ports = 4, .indat_endp_flip = 0, .outdat_endp_flip = 0, .indat_endpoints = {-1, -1, -1, -1}, /* single 'global' data in EP */ .outdat_endpoints = {0x01, 0x02, 0x04, 0x06}, .inack_endpoints = {-1, -1, -1, -1}, .outcont_endpoints = {-1, -1, -1, -1}, .instat_endpoint = 0x81, .indat_endpoint = 0x88, .glocont_endpoint = 0x00, /* uses control EP */ .calculate_baud_rate = keyspan_usa19w_calc_baud, .baudclk = KEYSPAN_USA19W_BAUDCLK, }; static const struct keyspan_device_details *keyspan_devices[] = { &usa18x_device_details, &usa19_device_details, &usa19qi_device_details, &mpr_device_details, &usa19qw_device_details, &usa19w_device_details, &usa19hs_device_details, &usa28_device_details, &usa28x_device_details, &usa28xa_device_details, &usa28xg_device_details, /* 28xb not required as it renumerates as a 28x */ &usa49w_device_details, &usa49wlc_device_details, &usa49wg_device_details, NULL, }; static const struct usb_device_id keyspan_ids_combined[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa18x_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19w_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qi_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qw_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_mpr_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28x_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xa_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xb_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49w_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wlc_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa18x_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19w_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qi_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qw_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19hs_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_mpr_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28x_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xa_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xg_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49w_product_id)}, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wlc_product_id)}, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wg_product_id)}, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, keyspan_ids_combined); /* usb_device_id table for the pre-firmware download keyspan devices */ static const struct usb_device_id keyspan_pre_ids[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa18x_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qi_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qw_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19w_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_mpr_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28x_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xa_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xb_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49w_pre_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wlc_pre_product_id) }, { } /* Terminating entry */ }; static const struct usb_device_id keyspan_1port_ids[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa18x_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qi_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19qw_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19w_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa19hs_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_mpr_product_id) }, { } /* Terminating entry */ }; static const struct usb_device_id keyspan_2port_ids[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28x_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xa_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa28xg_product_id) }, { } /* Terminating entry */ }; static const struct usb_device_id keyspan_4port_ids[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49w_product_id) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wlc_product_id)}, { USB_DEVICE(KEYSPAN_VENDOR_ID, keyspan_usa49wg_product_id)}, { } /* Terminating entry */ }; #define INSTAT_BUFLEN 32 #define GLOCONT_BUFLEN 64 #define INDAT49W_BUFLEN 512 #define IN_BUFLEN 64 #define OUT_BUFLEN 64 #define INACK_BUFLEN 1 #define OUTCONT_BUFLEN 64 /* Per device and per port private data */ struct keyspan_serial_private { const struct keyspan_device_details *device_details; struct urb *instat_urb; char *instat_buf; /* added to support 49wg, where data from all 4 ports comes in on 1 EP and high-speed supported */ struct urb *indat_urb; char *indat_buf; /* XXX this one probably will need a lock */ struct urb *glocont_urb; char *glocont_buf; char *ctrl_buf; /* for EP0 control message */ }; struct keyspan_port_private { /* Keep track of which input & output endpoints to use */ int in_flip; int out_flip; /* Keep duplicate of device details in each port structure as well - simplifies some of the callback functions etc. */ const struct keyspan_device_details *device_details; /* Input endpoints and buffer for this port */ struct urb *in_urbs[2]; char *in_buffer[2]; /* Output endpoints and buffer for this port */ struct urb *out_urbs[2]; char *out_buffer[2]; /* Input ack endpoint */ struct urb *inack_urb; char *inack_buffer; /* Output control endpoint */ struct urb *outcont_urb; char *outcont_buffer; /* Settings for the port */ int baud; int old_baud; unsigned int cflag; unsigned int old_cflag; enum {flow_none, flow_cts, flow_xon} flow_control; int rts_state; /* Handshaking pins (outputs) */ int dtr_state; int cts_state; /* Handshaking pins (inputs) */ int dsr_state; int dcd_state; int ri_state; int break_on; unsigned long tx_start_time[2]; int resend_cont; /* need to resend control packet */ }; /* Include Keyspan message headers. All current Keyspan Adapters make use of one of five message formats which are referred to as USA-26, USA-28, USA-49, USA-90, USA-67 by Keyspan and within this driver. */ #include "keyspan_usa26msg.h" #include "keyspan_usa28msg.h" #include "keyspan_usa49msg.h" #include "keyspan_usa90msg.h" #include "keyspan_usa67msg.h" static int keyspan_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct keyspan_port_private *p_priv; p_priv = usb_get_serial_port_data(port); if (break_state == -1) p_priv->break_on = 1; else p_priv->break_on = 0; /* FIXME: return errors */ keyspan_send_setup(port, 0); return 0; } static void keyspan_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { int baud_rate, device_port; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; unsigned int cflag; p_priv = usb_get_serial_port_data(port); d_details = p_priv->device_details; cflag = tty->termios.c_cflag; device_port = port->port_number; /* Baud rate calculation takes baud rate as an integer so other rates can be generated if desired. */ baud_rate = tty_get_baud_rate(tty); /* If no match or invalid, don't change */ if (d_details->calculate_baud_rate(port, baud_rate, d_details->baudclk, NULL, NULL, NULL, device_port) == KEYSPAN_BAUD_RATE_OK) { /* FIXME - more to do here to ensure rate changes cleanly */ /* FIXME - calculate exact rate from divisor ? */ p_priv->baud = baud_rate; } else baud_rate = tty_termios_baud_rate(old_termios); tty_encode_baud_rate(tty, baud_rate, baud_rate); /* set CTS/RTS handshake etc. */ p_priv->cflag = cflag; p_priv->flow_control = (cflag & CRTSCTS) ? flow_cts : flow_none; /* Mark/Space not supported */ tty->termios.c_cflag &= ~CMSPAR; keyspan_send_setup(port, 0); } static int keyspan_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct keyspan_port_private *p_priv = usb_get_serial_port_data(port); unsigned int value; value = ((p_priv->rts_state) ? TIOCM_RTS : 0) | ((p_priv->dtr_state) ? TIOCM_DTR : 0) | ((p_priv->cts_state) ? TIOCM_CTS : 0) | ((p_priv->dsr_state) ? TIOCM_DSR : 0) | ((p_priv->dcd_state) ? TIOCM_CAR : 0) | ((p_priv->ri_state) ? TIOCM_RNG : 0); return value; } static int keyspan_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct keyspan_port_private *p_priv = usb_get_serial_port_data(port); if (set & TIOCM_RTS) p_priv->rts_state = 1; if (set & TIOCM_DTR) p_priv->dtr_state = 1; if (clear & TIOCM_RTS) p_priv->rts_state = 0; if (clear & TIOCM_DTR) p_priv->dtr_state = 0; keyspan_send_setup(port, 0); return 0; } /* Write function is similar for the four protocols used with only a minor change for usa90 (usa19hs) required */ static int keyspan_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; int flip; int left, todo; struct urb *this_urb; int err, maxDataLen, dataOffset; p_priv = usb_get_serial_port_data(port); d_details = p_priv->device_details; if (d_details->msg_format == msg_usa90) { maxDataLen = 64; dataOffset = 0; } else { maxDataLen = 63; dataOffset = 1; } dev_dbg(&port->dev, "%s - %d chars, flip=%d\n", __func__, count, p_priv->out_flip); for (left = count; left > 0; left -= todo) { todo = left; if (todo > maxDataLen) todo = maxDataLen; flip = p_priv->out_flip; /* Check we have a valid urb/endpoint before we use it... */ this_urb = p_priv->out_urbs[flip]; if (this_urb == NULL) { /* no bulk out, so return 0 bytes written */ dev_dbg(&port->dev, "%s - no output urb :(\n", __func__); return count; } dev_dbg(&port->dev, "%s - endpoint %x flip %d\n", __func__, usb_pipeendpoint(this_urb->pipe), flip); if (this_urb->status == -EINPROGRESS) { if (time_before(jiffies, p_priv->tx_start_time[flip] + 10 * HZ)) break; usb_unlink_urb(this_urb); break; } /* First byte in buffer is "last flag" (except for usa19hx) - unused so for now so set to zero */ ((char *)this_urb->transfer_buffer)[0] = 0; memcpy(this_urb->transfer_buffer + dataOffset, buf, todo); buf += todo; /* send the data out the bulk port */ this_urb->transfer_buffer_length = todo + dataOffset; err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "usb_submit_urb(write bulk) failed (%d)\n", err); p_priv->tx_start_time[flip] = jiffies; /* Flip for next time if usa26 or usa28 interface (not used on usa49) */ p_priv->out_flip = (flip + 1) & d_details->outdat_endp_flip; } return count - left; } static void usa26_indat_callback(struct urb *urb) { int i, err; int endpoint; struct usb_serial_port *port; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status %d on endpoint %x\n", __func__, status, endpoint); return; } port = urb->context; if (urb->actual_length) { /* 0x80 bit is error flag */ if ((data[0] & 0x80) == 0) { /* no errors on individual bytes, only possible overrun err */ if (data[0] & RXERROR_OVERRUN) { tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } for (i = 1; i < urb->actual_length ; ++i) tty_insert_flip_char(&port->port, data[i], TTY_NORMAL); } else { /* some bytes had errors, every byte has status */ dev_dbg(&port->dev, "%s - RX error!!!!\n", __func__); for (i = 0; i + 1 < urb->actual_length; i += 2) { int stat = data[i]; int flag = TTY_NORMAL; if (stat & RXERROR_OVERRUN) { tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } /* XXX should handle break (0x10) */ if (stat & RXERROR_PARITY) flag = TTY_PARITY; else if (stat & RXERROR_FRAMING) flag = TTY_FRAME; tty_insert_flip_char(&port->port, data[i+1], flag); } } tty_flip_buffer_push(&port->port); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); } /* Outdat handling is common for all devices */ static void usa2x_outdat_callback(struct urb *urb) { struct usb_serial_port *port; struct keyspan_port_private *p_priv; port = urb->context; p_priv = usb_get_serial_port_data(port); dev_dbg(&port->dev, "%s - urb %d\n", __func__, urb == p_priv->out_urbs[1]); usb_serial_port_softint(port); } static void usa26_inack_callback(struct urb *urb) { } static void usa26_outcont_callback(struct urb *urb) { struct usb_serial_port *port; struct keyspan_port_private *p_priv; port = urb->context; p_priv = usb_get_serial_port_data(port); if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); keyspan_usa26_send_setup(port->serial, port, p_priv->resend_cont - 1); } } static void usa26_instat_callback(struct urb *urb) { unsigned char *data = urb->transfer_buffer; struct keyspan_usa26_portStatusMessage *msg; struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int old_dcd_state, err; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } if (urb->actual_length != 9) { dev_dbg(&urb->dev->dev, "%s - %d byte report??\n", __func__, urb->actual_length); goto exit; } msg = (struct keyspan_usa26_portStatusMessage *)data; /* Check port number from message and retrieve private data */ if (msg->port >= serial->num_ports) { dev_dbg(&urb->dev->dev, "%s - Unexpected port number %d\n", __func__, msg->port); goto exit; } port = serial->port[msg->port]; p_priv = usb_get_serial_port_data(port); if (!p_priv) goto resubmit; /* Update handshaking pin state information */ old_dcd_state = p_priv->dcd_state; p_priv->cts_state = ((msg->hskia_cts) ? 1 : 0); p_priv->dsr_state = ((msg->dsr) ? 1 : 0); p_priv->dcd_state = ((msg->gpia_dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); if (old_dcd_state != p_priv->dcd_state) tty_port_tty_hangup(&port->port, true); resubmit: /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); exit: ; } static void usa26_glocont_callback(struct urb *urb) { } static void usa28_indat_callback(struct urb *urb) { int err; struct usb_serial_port *port; unsigned char *data; struct keyspan_port_private *p_priv; int status = urb->status; port = urb->context; p_priv = usb_get_serial_port_data(port); if (urb != p_priv->in_urbs[p_priv->in_flip]) return; do { if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status %d on endpoint %x\n", __func__, status, usb_pipeendpoint(urb->pipe)); return; } port = urb->context; p_priv = usb_get_serial_port_data(port); data = urb->transfer_buffer; if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); p_priv->in_flip ^= 1; urb = p_priv->in_urbs[p_priv->in_flip]; } while (urb->status != -EINPROGRESS); } static void usa28_inack_callback(struct urb *urb) { } static void usa28_outcont_callback(struct urb *urb) { struct usb_serial_port *port; struct keyspan_port_private *p_priv; port = urb->context; p_priv = usb_get_serial_port_data(port); if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); keyspan_usa28_send_setup(port->serial, port, p_priv->resend_cont - 1); } } static void usa28_instat_callback(struct urb *urb) { int err; unsigned char *data = urb->transfer_buffer; struct keyspan_usa28_portStatusMessage *msg; struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int old_dcd_state; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } if (urb->actual_length != sizeof(struct keyspan_usa28_portStatusMessage)) { dev_dbg(&urb->dev->dev, "%s - bad length %d\n", __func__, urb->actual_length); goto exit; } msg = (struct keyspan_usa28_portStatusMessage *)data; /* Check port number from message and retrieve private data */ if (msg->port >= serial->num_ports) { dev_dbg(&urb->dev->dev, "%s - Unexpected port number %d\n", __func__, msg->port); goto exit; } port = serial->port[msg->port]; p_priv = usb_get_serial_port_data(port); if (!p_priv) goto resubmit; /* Update handshaking pin state information */ old_dcd_state = p_priv->dcd_state; p_priv->cts_state = ((msg->cts) ? 1 : 0); p_priv->dsr_state = ((msg->dsr) ? 1 : 0); p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); if (old_dcd_state != p_priv->dcd_state && old_dcd_state) tty_port_tty_hangup(&port->port, true); resubmit: /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); exit: ; } static void usa28_glocont_callback(struct urb *urb) { } static void usa49_glocont_callback(struct urb *urb) { struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int i; serial = urb->context; for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; p_priv = usb_get_serial_port_data(port); if (!p_priv) continue; if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); keyspan_usa49_send_setup(serial, port, p_priv->resend_cont - 1); break; } } } /* This is actually called glostat in the Keyspan doco */ static void usa49_instat_callback(struct urb *urb) { int err; unsigned char *data = urb->transfer_buffer; struct keyspan_usa49_portStatusMessage *msg; struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int old_dcd_state; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } if (urb->actual_length != sizeof(struct keyspan_usa49_portStatusMessage)) { dev_dbg(&urb->dev->dev, "%s - bad length %d\n", __func__, urb->actual_length); goto exit; } msg = (struct keyspan_usa49_portStatusMessage *)data; /* Check port number from message and retrieve private data */ if (msg->portNumber >= serial->num_ports) { dev_dbg(&urb->dev->dev, "%s - Unexpected port number %d\n", __func__, msg->portNumber); goto exit; } port = serial->port[msg->portNumber]; p_priv = usb_get_serial_port_data(port); if (!p_priv) goto resubmit; /* Update handshaking pin state information */ old_dcd_state = p_priv->dcd_state; p_priv->cts_state = ((msg->cts) ? 1 : 0); p_priv->dsr_state = ((msg->dsr) ? 1 : 0); p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); if (old_dcd_state != p_priv->dcd_state && old_dcd_state) tty_port_tty_hangup(&port->port, true); resubmit: /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); exit: ; } static void usa49_inack_callback(struct urb *urb) { } static void usa49_indat_callback(struct urb *urb) { int i, err; int endpoint; struct usb_serial_port *port; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status %d on endpoint %x\n", __func__, status, endpoint); return; } port = urb->context; if (urb->actual_length) { /* 0x80 bit is error flag */ if ((data[0] & 0x80) == 0) { /* no error on any byte */ tty_insert_flip_string(&port->port, data + 1, urb->actual_length - 1); } else { /* some bytes had errors, every byte has status */ for (i = 0; i + 1 < urb->actual_length; i += 2) { int stat = data[i]; int flag = TTY_NORMAL; if (stat & RXERROR_OVERRUN) { tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } /* XXX should handle break (0x10) */ if (stat & RXERROR_PARITY) flag = TTY_PARITY; else if (stat & RXERROR_FRAMING) flag = TTY_FRAME; tty_insert_flip_char(&port->port, data[i+1], flag); } } tty_flip_buffer_push(&port->port); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); } static void usa49wg_indat_callback(struct urb *urb) { int i, len, x, err; struct usb_serial *serial; struct usb_serial_port *port; unsigned char *data = urb->transfer_buffer; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } /* inbound data is in the form P#, len, status, data */ i = 0; len = 0; while (i < urb->actual_length) { /* Check port number from message */ if (data[i] >= serial->num_ports) { dev_dbg(&urb->dev->dev, "%s - Unexpected port number %d\n", __func__, data[i]); return; } port = serial->port[data[i++]]; len = data[i++]; /* 0x80 bit is error flag */ if ((data[i] & 0x80) == 0) { /* no error on any byte */ i++; for (x = 1; x < len && i < urb->actual_length; ++x) tty_insert_flip_char(&port->port, data[i++], 0); } else { /* * some bytes had errors, every byte has status */ for (x = 0; x + 1 < len && i + 1 < urb->actual_length; x += 2) { int stat = data[i]; int flag = TTY_NORMAL; if (stat & RXERROR_OVERRUN) { tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } /* XXX should handle break (0x10) */ if (stat & RXERROR_PARITY) flag = TTY_PARITY; else if (stat & RXERROR_FRAMING) flag = TTY_FRAME; tty_insert_flip_char(&port->port, data[i+1], flag); i += 2; } } tty_flip_buffer_push(&port->port); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&urb->dev->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); } /* not used, usa-49 doesn't have per-port control endpoints */ static void usa49_outcont_callback(struct urb *urb) { } static void usa90_indat_callback(struct urb *urb) { int i, err; int endpoint; struct usb_serial_port *port; struct keyspan_port_private *p_priv; unsigned char *data = urb->transfer_buffer; int status = urb->status; endpoint = usb_pipeendpoint(urb->pipe); if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status %d on endpoint %x\n", __func__, status, endpoint); return; } port = urb->context; p_priv = usb_get_serial_port_data(port); if (urb->actual_length) { /* if current mode is DMA, looks like usa28 format otherwise looks like usa26 data format */ if (p_priv->baud > 57600) tty_insert_flip_string(&port->port, data, urb->actual_length); else { /* 0x80 bit is error flag */ if ((data[0] & 0x80) == 0) { /* no errors on individual bytes, only possible overrun err*/ if (data[0] & RXERROR_OVERRUN) { tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } for (i = 1; i < urb->actual_length ; ++i) tty_insert_flip_char(&port->port, data[i], TTY_NORMAL); } else { /* some bytes had errors, every byte has status */ dev_dbg(&port->dev, "%s - RX error!!!!\n", __func__); for (i = 0; i + 1 < urb->actual_length; i += 2) { int stat = data[i]; int flag = TTY_NORMAL; if (stat & RXERROR_OVERRUN) { tty_insert_flip_char( &port->port, 0, TTY_OVERRUN); } /* XXX should handle break (0x10) */ if (stat & RXERROR_PARITY) flag = TTY_PARITY; else if (stat & RXERROR_FRAMING) flag = TTY_FRAME; tty_insert_flip_char(&port->port, data[i+1], flag); } } } tty_flip_buffer_push(&port->port); } /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); } static void usa90_instat_callback(struct urb *urb) { unsigned char *data = urb->transfer_buffer; struct keyspan_usa90_portStatusMessage *msg; struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int old_dcd_state, err; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } if (urb->actual_length < 14) { dev_dbg(&urb->dev->dev, "%s - %d byte report??\n", __func__, urb->actual_length); goto exit; } msg = (struct keyspan_usa90_portStatusMessage *)data; /* Now do something useful with the data */ port = serial->port[0]; p_priv = usb_get_serial_port_data(port); if (!p_priv) goto resubmit; /* Update handshaking pin state information */ old_dcd_state = p_priv->dcd_state; p_priv->cts_state = ((msg->cts) ? 1 : 0); p_priv->dsr_state = ((msg->dsr) ? 1 : 0); p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); if (old_dcd_state != p_priv->dcd_state && old_dcd_state) tty_port_tty_hangup(&port->port, true); resubmit: /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); exit: ; } static void usa90_outcont_callback(struct urb *urb) { struct usb_serial_port *port; struct keyspan_port_private *p_priv; port = urb->context; p_priv = usb_get_serial_port_data(port); if (p_priv->resend_cont) { dev_dbg(&urb->dev->dev, "%s - sending setup\n", __func__); keyspan_usa90_send_setup(port->serial, port, p_priv->resend_cont - 1); } } /* Status messages from the 28xg */ static void usa67_instat_callback(struct urb *urb) { int err; unsigned char *data = urb->transfer_buffer; struct keyspan_usa67_portStatusMessage *msg; struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int old_dcd_state; int status = urb->status; serial = urb->context; if (status) { dev_dbg(&urb->dev->dev, "%s - nonzero status: %d\n", __func__, status); return; } if (urb->actual_length != sizeof(struct keyspan_usa67_portStatusMessage)) { dev_dbg(&urb->dev->dev, "%s - bad length %d\n", __func__, urb->actual_length); return; } /* Now do something useful with the data */ msg = (struct keyspan_usa67_portStatusMessage *)data; /* Check port number from message and retrieve private data */ if (msg->port >= serial->num_ports) { dev_dbg(&urb->dev->dev, "%s - Unexpected port number %d\n", __func__, msg->port); return; } port = serial->port[msg->port]; p_priv = usb_get_serial_port_data(port); if (!p_priv) goto resubmit; /* Update handshaking pin state information */ old_dcd_state = p_priv->dcd_state; p_priv->cts_state = ((msg->hskia_cts) ? 1 : 0); p_priv->dcd_state = ((msg->gpia_dcd) ? 1 : 0); if (old_dcd_state != p_priv->dcd_state && old_dcd_state) tty_port_tty_hangup(&port->port, true); resubmit: /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - resubmit read urb failed. (%d)\n", __func__, err); } static void usa67_glocont_callback(struct urb *urb) { struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; int i; serial = urb->context; for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; p_priv = usb_get_serial_port_data(port); if (!p_priv) continue; if (p_priv->resend_cont) { dev_dbg(&port->dev, "%s - sending setup\n", __func__); keyspan_usa67_send_setup(serial, port, p_priv->resend_cont - 1); break; } } } static unsigned int keyspan_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; int flip; unsigned int data_len; struct urb *this_urb; p_priv = usb_get_serial_port_data(port); d_details = p_priv->device_details; /* FIXME: locking */ if (d_details->msg_format == msg_usa90) data_len = 64; else data_len = 63; flip = p_priv->out_flip; /* Check both endpoints to see if any are available. */ this_urb = p_priv->out_urbs[flip]; if (this_urb != NULL) { if (this_urb->status != -EINPROGRESS) return data_len; flip = (flip + 1) & d_details->outdat_endp_flip; this_urb = p_priv->out_urbs[flip]; if (this_urb != NULL) { if (this_urb->status != -EINPROGRESS) return data_len; } } return 0; } static int keyspan_open(struct tty_struct *tty, struct usb_serial_port *port) { struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; int i, err; int baud_rate, device_port; struct urb *urb; unsigned int cflag = 0; p_priv = usb_get_serial_port_data(port); d_details = p_priv->device_details; /* Set some sane defaults */ p_priv->rts_state = 1; p_priv->dtr_state = 1; p_priv->baud = 9600; /* force baud and lcr to be set on open */ p_priv->old_baud = 0; p_priv->old_cflag = 0; p_priv->out_flip = 0; p_priv->in_flip = 0; /* Reset low level data toggle and start reading from endpoints */ for (i = 0; i < 2; i++) { urb = p_priv->in_urbs[i]; if (urb == NULL) continue; /* make sure endpoint data toggle is synchronized with the device */ usb_clear_halt(urb->dev, urb->pipe); err = usb_submit_urb(urb, GFP_KERNEL); if (err != 0) dev_dbg(&port->dev, "%s - submit urb %d failed (%d)\n", __func__, i, err); } /* Reset low level data toggle on out endpoints */ for (i = 0; i < 2; i++) { urb = p_priv->out_urbs[i]; if (urb == NULL) continue; /* usb_settoggle(urb->dev, usb_pipeendpoint(urb->pipe), usb_pipeout(urb->pipe), 0); */ } /* get the terminal config for the setup message now so we don't * need to send 2 of them */ device_port = port->port_number; if (tty) { cflag = tty->termios.c_cflag; /* Baud rate calculation takes baud rate as an integer so other rates can be generated if desired. */ baud_rate = tty_get_baud_rate(tty); /* If no match or invalid, leave as default */ if (baud_rate >= 0 && d_details->calculate_baud_rate(port, baud_rate, d_details->baudclk, NULL, NULL, NULL, device_port) == KEYSPAN_BAUD_RATE_OK) { p_priv->baud = baud_rate; } } /* set CTS/RTS handshake etc. */ p_priv->cflag = cflag; p_priv->flow_control = (cflag & CRTSCTS) ? flow_cts : flow_none; keyspan_send_setup(port, 1); /* mdelay(100); */ /* keyspan_set_termios(port, NULL); */ return 0; } static void keyspan_dtr_rts(struct usb_serial_port *port, int on) { struct keyspan_port_private *p_priv = usb_get_serial_port_data(port); p_priv->rts_state = on; p_priv->dtr_state = on; keyspan_send_setup(port, 0); } static void keyspan_close(struct usb_serial_port *port) { int i; struct keyspan_port_private *p_priv; p_priv = usb_get_serial_port_data(port); p_priv->rts_state = 0; p_priv->dtr_state = 0; keyspan_send_setup(port, 2); /* pilot-xfer seems to work best with this delay */ mdelay(100); p_priv->out_flip = 0; p_priv->in_flip = 0; usb_kill_urb(p_priv->inack_urb); for (i = 0; i < 2; i++) { usb_kill_urb(p_priv->in_urbs[i]); usb_kill_urb(p_priv->out_urbs[i]); } } /* download the firmware to a pre-renumeration device */ static int keyspan_fake_startup(struct usb_serial *serial) { char *fw_name; dev_dbg(&serial->dev->dev, "Keyspan startup version %04x product %04x\n", le16_to_cpu(serial->dev->descriptor.bcdDevice), le16_to_cpu(serial->dev->descriptor.idProduct)); if ((le16_to_cpu(serial->dev->descriptor.bcdDevice) & 0x8000) != 0x8000) { dev_dbg(&serial->dev->dev, "Firmware already loaded. Quitting.\n"); return 1; } /* Select firmware image on the basis of idProduct */ switch (le16_to_cpu(serial->dev->descriptor.idProduct)) { case keyspan_usa28_pre_product_id: fw_name = "keyspan/usa28.fw"; break; case keyspan_usa28x_pre_product_id: fw_name = "keyspan/usa28x.fw"; break; case keyspan_usa28xa_pre_product_id: fw_name = "keyspan/usa28xa.fw"; break; case keyspan_usa28xb_pre_product_id: fw_name = "keyspan/usa28xb.fw"; break; case keyspan_usa19_pre_product_id: fw_name = "keyspan/usa19.fw"; break; case keyspan_usa19qi_pre_product_id: fw_name = "keyspan/usa19qi.fw"; break; case keyspan_mpr_pre_product_id: fw_name = "keyspan/mpr.fw"; break; case keyspan_usa19qw_pre_product_id: fw_name = "keyspan/usa19qw.fw"; break; case keyspan_usa18x_pre_product_id: fw_name = "keyspan/usa18x.fw"; break; case keyspan_usa19w_pre_product_id: fw_name = "keyspan/usa19w.fw"; break; case keyspan_usa49w_pre_product_id: fw_name = "keyspan/usa49w.fw"; break; case keyspan_usa49wlc_pre_product_id: fw_name = "keyspan/usa49wlc.fw"; break; default: dev_err(&serial->dev->dev, "Unknown product ID (%04x)\n", le16_to_cpu(serial->dev->descriptor.idProduct)); return 1; } dev_dbg(&serial->dev->dev, "Uploading Keyspan %s firmware.\n", fw_name); if (ezusb_fx1_ihex_firmware_download(serial->dev, fw_name) < 0) { dev_err(&serial->dev->dev, "failed to load firmware \"%s\"\n", fw_name); return -ENOENT; } /* after downloading firmware Renumeration will occur in a moment and the new device will bind to the real driver */ /* we don't want this device to have a driver assigned to it. */ return 1; } /* Helper functions used by keyspan_setup_urbs */ static struct usb_endpoint_descriptor const *find_ep(struct usb_serial const *serial, int endpoint) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *ep; int i; iface_desc = serial->interface->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { ep = &iface_desc->endpoint[i].desc; if (ep->bEndpointAddress == endpoint) return ep; } dev_warn(&serial->interface->dev, "found no endpoint descriptor for endpoint %x\n", endpoint); return NULL; } static struct urb *keyspan_setup_urb(struct usb_serial *serial, int endpoint, int dir, void *ctx, char *buf, int len, void (*callback)(struct urb *)) { struct urb *urb; struct usb_endpoint_descriptor const *ep_desc; char const *ep_type_name; if (endpoint == -1) return NULL; /* endpoint not needed */ dev_dbg(&serial->interface->dev, "%s - alloc for endpoint %x\n", __func__, endpoint); urb = usb_alloc_urb(0, GFP_KERNEL); /* No ISO */ if (!urb) return NULL; if (endpoint == 0) { /* control EP filled in when used */ return urb; } ep_desc = find_ep(serial, endpoint); if (!ep_desc) { usb_free_urb(urb); return NULL; } if (usb_endpoint_xfer_int(ep_desc)) { ep_type_name = "INT"; usb_fill_int_urb(urb, serial->dev, usb_sndintpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx, ep_desc->bInterval); } else if (usb_endpoint_xfer_bulk(ep_desc)) { ep_type_name = "BULK"; usb_fill_bulk_urb(urb, serial->dev, usb_sndbulkpipe(serial->dev, endpoint) | dir, buf, len, callback, ctx); } else { dev_warn(&serial->interface->dev, "unsupported endpoint type %x\n", usb_endpoint_type(ep_desc)); usb_free_urb(urb); return NULL; } dev_dbg(&serial->interface->dev, "%s - using urb %p for %s endpoint %x\n", __func__, urb, ep_type_name, endpoint); return urb; } static struct callbacks { void (*instat_callback)(struct urb *); void (*glocont_callback)(struct urb *); void (*indat_callback)(struct urb *); void (*outdat_callback)(struct urb *); void (*inack_callback)(struct urb *); void (*outcont_callback)(struct urb *); } keyspan_callbacks[] = { { /* msg_usa26 callbacks */ .instat_callback = usa26_instat_callback, .glocont_callback = usa26_glocont_callback, .indat_callback = usa26_indat_callback, .outdat_callback = usa2x_outdat_callback, .inack_callback = usa26_inack_callback, .outcont_callback = usa26_outcont_callback, }, { /* msg_usa28 callbacks */ .instat_callback = usa28_instat_callback, .glocont_callback = usa28_glocont_callback, .indat_callback = usa28_indat_callback, .outdat_callback = usa2x_outdat_callback, .inack_callback = usa28_inack_callback, .outcont_callback = usa28_outcont_callback, }, { /* msg_usa49 callbacks */ .instat_callback = usa49_instat_callback, .glocont_callback = usa49_glocont_callback, .indat_callback = usa49_indat_callback, .outdat_callback = usa2x_outdat_callback, .inack_callback = usa49_inack_callback, .outcont_callback = usa49_outcont_callback, }, { /* msg_usa90 callbacks */ .instat_callback = usa90_instat_callback, .glocont_callback = usa28_glocont_callback, .indat_callback = usa90_indat_callback, .outdat_callback = usa2x_outdat_callback, .inack_callback = usa28_inack_callback, .outcont_callback = usa90_outcont_callback, }, { /* msg_usa67 callbacks */ .instat_callback = usa67_instat_callback, .glocont_callback = usa67_glocont_callback, .indat_callback = usa26_indat_callback, .outdat_callback = usa2x_outdat_callback, .inack_callback = usa26_inack_callback, .outcont_callback = usa26_outcont_callback, } }; /* Generic setup urbs function that uses data in device_details */ static void keyspan_setup_urbs(struct usb_serial *serial) { struct keyspan_serial_private *s_priv; const struct keyspan_device_details *d_details; struct callbacks *cback; s_priv = usb_get_serial_data(serial); d_details = s_priv->device_details; /* Setup values for the various callback routines */ cback = &keyspan_callbacks[d_details->msg_format]; /* Allocate and set up urbs for each one that is in use, starting with instat endpoints */ s_priv->instat_urb = keyspan_setup_urb (serial, d_details->instat_endpoint, USB_DIR_IN, serial, s_priv->instat_buf, INSTAT_BUFLEN, cback->instat_callback); s_priv->indat_urb = keyspan_setup_urb (serial, d_details->indat_endpoint, USB_DIR_IN, serial, s_priv->indat_buf, INDAT49W_BUFLEN, usa49wg_indat_callback); s_priv->glocont_urb = keyspan_setup_urb (serial, d_details->glocont_endpoint, USB_DIR_OUT, serial, s_priv->glocont_buf, GLOCONT_BUFLEN, cback->glocont_callback); } /* usa19 function doesn't require prescaler */ static int keyspan_usa19_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum) { u32 b16, /* baud rate times 16 (actual rate used internally) */ div, /* divisor */ cnt; /* inverse of divisor (programmed into 8051) */ dev_dbg(&port->dev, "%s - %d.\n", __func__, baud_rate); /* prevent divide by zero... */ b16 = baud_rate * 16L; if (b16 == 0) return KEYSPAN_INVALID_BAUD_RATE; /* Any "standard" rate over 57k6 is marginal on the USA-19 as we run out of divisor resolution. */ if (baud_rate > 57600) return KEYSPAN_INVALID_BAUD_RATE; /* calculate the divisor and the counter (its inverse) */ div = baudclk / b16; if (div == 0) return KEYSPAN_INVALID_BAUD_RATE; else cnt = 0 - div; if (div > 0xffff) return KEYSPAN_INVALID_BAUD_RATE; /* return the counter values if non-null */ if (rate_low) *rate_low = (u8) (cnt & 0xff); if (rate_hi) *rate_hi = (u8) ((cnt >> 8) & 0xff); if (rate_low && rate_hi) dev_dbg(&port->dev, "%s - %d %02x %02x.\n", __func__, baud_rate, *rate_hi, *rate_low); return KEYSPAN_BAUD_RATE_OK; } /* usa19hs function doesn't require prescaler */ static int keyspan_usa19hs_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum) { u32 b16, /* baud rate times 16 (actual rate used internally) */ div; /* divisor */ dev_dbg(&port->dev, "%s - %d.\n", __func__, baud_rate); /* prevent divide by zero... */ b16 = baud_rate * 16L; if (b16 == 0) return KEYSPAN_INVALID_BAUD_RATE; /* calculate the divisor */ div = baudclk / b16; if (div == 0) return KEYSPAN_INVALID_BAUD_RATE; if (div > 0xffff) return KEYSPAN_INVALID_BAUD_RATE; /* return the counter values if non-null */ if (rate_low) *rate_low = (u8) (div & 0xff); if (rate_hi) *rate_hi = (u8) ((div >> 8) & 0xff); if (rate_low && rate_hi) dev_dbg(&port->dev, "%s - %d %02x %02x.\n", __func__, baud_rate, *rate_hi, *rate_low); return KEYSPAN_BAUD_RATE_OK; } static int keyspan_usa19w_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum) { u32 b16, /* baud rate times 16 (actual rate used internally) */ clk, /* clock with 13/8 prescaler */ div, /* divisor using 13/8 prescaler */ res, /* resulting baud rate using 13/8 prescaler */ diff, /* error using 13/8 prescaler */ smallest_diff; u8 best_prescaler; int i; dev_dbg(&port->dev, "%s - %d.\n", __func__, baud_rate); /* prevent divide by zero */ b16 = baud_rate * 16L; if (b16 == 0) return KEYSPAN_INVALID_BAUD_RATE; /* Calculate prescaler by trying them all and looking for best fit */ /* start with largest possible difference */ smallest_diff = 0xffffffff; /* 0 is an invalid prescaler, used as a flag */ best_prescaler = 0; for (i = 8; i <= 0xff; ++i) { clk = (baudclk * 8) / (u32) i; div = clk / b16; if (div == 0) continue; res = clk / div; diff = (res > b16) ? (res-b16) : (b16-res); if (diff < smallest_diff) { best_prescaler = i; smallest_diff = diff; } } if (best_prescaler == 0) return KEYSPAN_INVALID_BAUD_RATE; clk = (baudclk * 8) / (u32) best_prescaler; div = clk / b16; /* return the divisor and prescaler if non-null */ if (rate_low) *rate_low = (u8) (div & 0xff); if (rate_hi) *rate_hi = (u8) ((div >> 8) & 0xff); if (prescaler) { *prescaler = best_prescaler; /* dev_dbg(&port->dev, "%s - %d %d\n", __func__, *prescaler, div); */ } return KEYSPAN_BAUD_RATE_OK; } /* USA-28 supports different maximum baud rates on each port */ static int keyspan_usa28_calc_baud(struct usb_serial_port *port, u32 baud_rate, u32 baudclk, u8 *rate_hi, u8 *rate_low, u8 *prescaler, int portnum) { u32 b16, /* baud rate times 16 (actual rate used internally) */ div, /* divisor */ cnt; /* inverse of divisor (programmed into 8051) */ dev_dbg(&port->dev, "%s - %d.\n", __func__, baud_rate); /* prevent divide by zero */ b16 = baud_rate * 16L; if (b16 == 0) return KEYSPAN_INVALID_BAUD_RATE; /* calculate the divisor and the counter (its inverse) */ div = KEYSPAN_USA28_BAUDCLK / b16; if (div == 0) return KEYSPAN_INVALID_BAUD_RATE; else cnt = 0 - div; /* check for out of range, based on portnum, and return result */ if (portnum == 0) { if (div > 0xffff) return KEYSPAN_INVALID_BAUD_RATE; } else { if (portnum == 1) { if (div > 0xff) return KEYSPAN_INVALID_BAUD_RATE; } else return KEYSPAN_INVALID_BAUD_RATE; } /* return the counter values if not NULL (port 1 will ignore retHi) */ if (rate_low) *rate_low = (u8) (cnt & 0xff); if (rate_hi) *rate_hi = (u8) ((cnt >> 8) & 0xff); dev_dbg(&port->dev, "%s - %d OK.\n", __func__, baud_rate); return KEYSPAN_BAUD_RATE_OK; } static int keyspan_usa26_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port) { struct keyspan_usa26_portControlMessage msg; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct urb *this_urb; int device_port, err; dev_dbg(&port->dev, "%s reset=%d\n", __func__, reset_port); s_priv = usb_get_serial_data(serial); p_priv = usb_get_serial_port_data(port); d_details = s_priv->device_details; device_port = port->port_number; this_urb = p_priv->outcont_urb; /* Make sure we have an urb then send the message */ if (this_urb == NULL) { dev_dbg(&port->dev, "%s - oops no urb.\n", __func__); return -1; } dev_dbg(&port->dev, "%s - endpoint %x\n", __func__, usb_pipeendpoint(this_urb->pipe)); /* Save reset port val for resend. Don't overwrite resend for open/close condition. */ if ((reset_port + 1) > p_priv->resend_cont) p_priv->resend_cont = reset_port + 1; if (this_urb->status == -EINPROGRESS) { /* dev_dbg(&port->dev, "%s - already writing\n", __func__); */ mdelay(5); return -1; } memset(&msg, 0, sizeof(struct keyspan_usa26_portControlMessage)); /* Only set baud rate if it's changed */ if (p_priv->old_baud != p_priv->baud) { p_priv->old_baud = p_priv->baud; msg.setClocking = 0xff; if (d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, &msg.prescaler, device_port) == KEYSPAN_INVALID_BAUD_RATE) { dev_dbg(&port->dev, "%s - Invalid baud rate %d requested, using 9600.\n", __func__, p_priv->baud); msg.baudLo = 0; msg.baudHi = 125; /* Values for 9600 baud */ msg.prescaler = 10; } msg.setPrescaler = 0xff; } msg.lcr = (p_priv->cflag & CSTOPB) ? STOPBITS_678_2 : STOPBITS_5678_1; switch (p_priv->cflag & CSIZE) { case CS5: msg.lcr |= USA_DATABITS_5; break; case CS6: msg.lcr |= USA_DATABITS_6; break; case CS7: msg.lcr |= USA_DATABITS_7; break; case CS8: msg.lcr |= USA_DATABITS_8; break; } if (p_priv->cflag & PARENB) { /* note USA_PARITY_NONE == 0 */ msg.lcr |= (p_priv->cflag & PARODD) ? USA_PARITY_ODD : USA_PARITY_EVEN; } msg.setLcr = 0xff; msg.ctsFlowControl = (p_priv->flow_control == flow_cts); msg.xonFlowControl = 0; msg.setFlowControl = 0xff; msg.forwardingLength = 16; msg.xonChar = 17; msg.xoffChar = 19; /* Opening port */ if (reset_port == 1) { msg._txOn = 1; msg._txOff = 0; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 1; msg.rxOff = 0; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0xff; } /* Closing port */ else if (reset_port == 2) { msg._txOn = 0; msg._txOff = 1; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 0; msg.rxOff = 1; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0; } /* Sending intermediate configs */ else { msg._txOn = (!p_priv->break_on); msg._txOff = 0; msg.txFlush = 0; msg.txBreak = (p_priv->break_on); msg.rxOn = 0; msg.rxOff = 0; msg.rxFlush = 0; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0x0; } /* Do handshaking outputs */ msg.setTxTriState_setRts = 0xff; msg.txTriState_rts = p_priv->rts_state; msg.setHskoa_setDtr = 0xff; msg.hskoa_dtr = p_priv->dtr_state; p_priv->resend_cont = 0; memcpy(this_urb->transfer_buffer, &msg, sizeof(msg)); /* send the data out the device on control endpoint */ this_urb->transfer_buffer_length = sizeof(msg); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - usb_submit_urb(setup) failed (%d)\n", __func__, err); return 0; } static int keyspan_usa28_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port) { struct keyspan_usa28_portControlMessage msg; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct urb *this_urb; int device_port, err; s_priv = usb_get_serial_data(serial); p_priv = usb_get_serial_port_data(port); d_details = s_priv->device_details; device_port = port->port_number; /* only do something if we have a bulk out endpoint */ this_urb = p_priv->outcont_urb; if (this_urb == NULL) { dev_dbg(&port->dev, "%s - oops no urb.\n", __func__); return -1; } /* Save reset port val for resend. Don't overwrite resend for open/close condition. */ if ((reset_port + 1) > p_priv->resend_cont) p_priv->resend_cont = reset_port + 1; if (this_urb->status == -EINPROGRESS) { dev_dbg(&port->dev, "%s already writing\n", __func__); mdelay(5); return -1; } memset(&msg, 0, sizeof(struct keyspan_usa28_portControlMessage)); msg.setBaudRate = 1; if (d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, NULL, device_port) == KEYSPAN_INVALID_BAUD_RATE) { dev_dbg(&port->dev, "%s - Invalid baud rate requested %d.\n", __func__, p_priv->baud); msg.baudLo = 0xff; msg.baudHi = 0xb2; /* Values for 9600 baud */ } /* If parity is enabled, we must calculate it ourselves. */ msg.parity = 0; /* XXX for now */ msg.ctsFlowControl = (p_priv->flow_control == flow_cts); msg.xonFlowControl = 0; /* Do handshaking outputs, DTR is inverted relative to RTS */ msg.rts = p_priv->rts_state; msg.dtr = p_priv->dtr_state; msg.forwardingLength = 16; msg.forwardMs = 10; msg.breakThreshold = 45; msg.xonChar = 17; msg.xoffChar = 19; /*msg.returnStatus = 1; msg.resetDataToggle = 0xff;*/ /* Opening port */ if (reset_port == 1) { msg._txOn = 1; msg._txOff = 0; msg.txFlush = 0; msg.txForceXoff = 0; msg.txBreak = 0; msg.rxOn = 1; msg.rxOff = 0; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0xff; } /* Closing port */ else if (reset_port == 2) { msg._txOn = 0; msg._txOff = 1; msg.txFlush = 0; msg.txForceXoff = 0; msg.txBreak = 0; msg.rxOn = 0; msg.rxOff = 1; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0; } /* Sending intermediate configs */ else { msg._txOn = (!p_priv->break_on); msg._txOff = 0; msg.txFlush = 0; msg.txForceXoff = 0; msg.txBreak = (p_priv->break_on); msg.rxOn = 0; msg.rxOff = 0; msg.rxFlush = 0; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0x0; } p_priv->resend_cont = 0; memcpy(this_urb->transfer_buffer, &msg, sizeof(msg)); /* send the data out the device on control endpoint */ this_urb->transfer_buffer_length = sizeof(msg); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - usb_submit_urb(setup) failed\n", __func__); return 0; } static int keyspan_usa49_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port) { struct keyspan_usa49_portControlMessage msg; struct usb_ctrlrequest *dr = NULL; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct urb *this_urb; int err, device_port; s_priv = usb_get_serial_data(serial); p_priv = usb_get_serial_port_data(port); d_details = s_priv->device_details; this_urb = s_priv->glocont_urb; /* Work out which port within the device is being setup */ device_port = port->port_number; /* Make sure we have an urb then send the message */ if (this_urb == NULL) { dev_dbg(&port->dev, "%s - oops no urb for port.\n", __func__); return -1; } dev_dbg(&port->dev, "%s - endpoint %x (%d)\n", __func__, usb_pipeendpoint(this_urb->pipe), device_port); /* Save reset port val for resend. Don't overwrite resend for open/close condition. */ if ((reset_port + 1) > p_priv->resend_cont) p_priv->resend_cont = reset_port + 1; if (this_urb->status == -EINPROGRESS) { /* dev_dbg(&port->dev, "%s - already writing\n", __func__); */ mdelay(5); return -1; } memset(&msg, 0, sizeof(struct keyspan_usa49_portControlMessage)); msg.portNumber = device_port; /* Only set baud rate if it's changed */ if (p_priv->old_baud != p_priv->baud) { p_priv->old_baud = p_priv->baud; msg.setClocking = 0xff; if (d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, &msg.prescaler, device_port) == KEYSPAN_INVALID_BAUD_RATE) { dev_dbg(&port->dev, "%s - Invalid baud rate %d requested, using 9600.\n", __func__, p_priv->baud); msg.baudLo = 0; msg.baudHi = 125; /* Values for 9600 baud */ msg.prescaler = 10; } /* msg.setPrescaler = 0xff; */ } msg.lcr = (p_priv->cflag & CSTOPB) ? STOPBITS_678_2 : STOPBITS_5678_1; switch (p_priv->cflag & CSIZE) { case CS5: msg.lcr |= USA_DATABITS_5; break; case CS6: msg.lcr |= USA_DATABITS_6; break; case CS7: msg.lcr |= USA_DATABITS_7; break; case CS8: msg.lcr |= USA_DATABITS_8; break; } if (p_priv->cflag & PARENB) { /* note USA_PARITY_NONE == 0 */ msg.lcr |= (p_priv->cflag & PARODD) ? USA_PARITY_ODD : USA_PARITY_EVEN; } msg.setLcr = 0xff; msg.ctsFlowControl = (p_priv->flow_control == flow_cts); msg.xonFlowControl = 0; msg.setFlowControl = 0xff; msg.forwardingLength = 16; msg.xonChar = 17; msg.xoffChar = 19; /* Opening port */ if (reset_port == 1) { msg._txOn = 1; msg._txOff = 0; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 1; msg.rxOff = 0; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0xff; msg.enablePort = 1; msg.disablePort = 0; } /* Closing port */ else if (reset_port == 2) { msg._txOn = 0; msg._txOff = 1; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 0; msg.rxOff = 1; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0; msg.enablePort = 0; msg.disablePort = 1; } /* Sending intermediate configs */ else { msg._txOn = (!p_priv->break_on); msg._txOff = 0; msg.txFlush = 0; msg.txBreak = (p_priv->break_on); msg.rxOn = 0; msg.rxOff = 0; msg.rxFlush = 0; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0x0; msg.enablePort = 0; msg.disablePort = 0; } /* Do handshaking outputs */ msg.setRts = 0xff; msg.rts = p_priv->rts_state; msg.setDtr = 0xff; msg.dtr = p_priv->dtr_state; p_priv->resend_cont = 0; /* if the device is a 49wg, we send control message on usb control EP 0 */ if (d_details->product_id == keyspan_usa49wg_product_id) { dr = (void *)(s_priv->ctrl_buf); dr->bRequestType = USB_TYPE_VENDOR | USB_DIR_OUT; dr->bRequest = 0xB0; /* 49wg control message */ dr->wValue = 0; dr->wIndex = 0; dr->wLength = cpu_to_le16(sizeof(msg)); memcpy(s_priv->glocont_buf, &msg, sizeof(msg)); usb_fill_control_urb(this_urb, serial->dev, usb_sndctrlpipe(serial->dev, 0), (unsigned char *)dr, s_priv->glocont_buf, sizeof(msg), usa49_glocont_callback, serial); } else { memcpy(this_urb->transfer_buffer, &msg, sizeof(msg)); /* send the data out the device on control endpoint */ this_urb->transfer_buffer_length = sizeof(msg); } err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - usb_submit_urb(setup) failed (%d)\n", __func__, err); return 0; } static int keyspan_usa90_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port) { struct keyspan_usa90_portControlMessage msg; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct urb *this_urb; int err; u8 prescaler; s_priv = usb_get_serial_data(serial); p_priv = usb_get_serial_port_data(port); d_details = s_priv->device_details; /* only do something if we have a bulk out endpoint */ this_urb = p_priv->outcont_urb; if (this_urb == NULL) { dev_dbg(&port->dev, "%s - oops no urb.\n", __func__); return -1; } /* Save reset port val for resend. Don't overwrite resend for open/close condition. */ if ((reset_port + 1) > p_priv->resend_cont) p_priv->resend_cont = reset_port + 1; if (this_urb->status == -EINPROGRESS) { dev_dbg(&port->dev, "%s already writing\n", __func__); mdelay(5); return -1; } memset(&msg, 0, sizeof(struct keyspan_usa90_portControlMessage)); /* Only set baud rate if it's changed */ if (p_priv->old_baud != p_priv->baud) { p_priv->old_baud = p_priv->baud; msg.setClocking = 0x01; if (d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, &prescaler, 0) == KEYSPAN_INVALID_BAUD_RATE) { dev_dbg(&port->dev, "%s - Invalid baud rate %d requested, using 9600.\n", __func__, p_priv->baud); p_priv->baud = 9600; d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, &prescaler, 0); } msg.setRxMode = 1; msg.setTxMode = 1; } /* modes must always be correctly specified */ if (p_priv->baud > 57600) { msg.rxMode = RXMODE_DMA; msg.txMode = TXMODE_DMA; } else { msg.rxMode = RXMODE_BYHAND; msg.txMode = TXMODE_BYHAND; } msg.lcr = (p_priv->cflag & CSTOPB) ? STOPBITS_678_2 : STOPBITS_5678_1; switch (p_priv->cflag & CSIZE) { case CS5: msg.lcr |= USA_DATABITS_5; break; case CS6: msg.lcr |= USA_DATABITS_6; break; case CS7: msg.lcr |= USA_DATABITS_7; break; case CS8: msg.lcr |= USA_DATABITS_8; break; } if (p_priv->cflag & PARENB) { /* note USA_PARITY_NONE == 0 */ msg.lcr |= (p_priv->cflag & PARODD) ? USA_PARITY_ODD : USA_PARITY_EVEN; } if (p_priv->old_cflag != p_priv->cflag) { p_priv->old_cflag = p_priv->cflag; msg.setLcr = 0x01; } if (p_priv->flow_control == flow_cts) msg.txFlowControl = TXFLOW_CTS; msg.setTxFlowControl = 0x01; msg.setRxFlowControl = 0x01; msg.rxForwardingLength = 16; msg.rxForwardingTimeout = 16; msg.txAckSetting = 0; msg.xonChar = 17; msg.xoffChar = 19; /* Opening port */ if (reset_port == 1) { msg.portEnabled = 1; msg.rxFlush = 1; msg.txBreak = (p_priv->break_on); } /* Closing port */ else if (reset_port == 2) msg.portEnabled = 0; /* Sending intermediate configs */ else { msg.portEnabled = 1; msg.txBreak = (p_priv->break_on); } /* Do handshaking outputs */ msg.setRts = 0x01; msg.rts = p_priv->rts_state; msg.setDtr = 0x01; msg.dtr = p_priv->dtr_state; p_priv->resend_cont = 0; memcpy(this_urb->transfer_buffer, &msg, sizeof(msg)); /* send the data out the device on control endpoint */ this_urb->transfer_buffer_length = sizeof(msg); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - usb_submit_urb(setup) failed (%d)\n", __func__, err); return 0; } static int keyspan_usa67_send_setup(struct usb_serial *serial, struct usb_serial_port *port, int reset_port) { struct keyspan_usa67_portControlMessage msg; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct urb *this_urb; int err, device_port; s_priv = usb_get_serial_data(serial); p_priv = usb_get_serial_port_data(port); d_details = s_priv->device_details; this_urb = s_priv->glocont_urb; /* Work out which port within the device is being setup */ device_port = port->port_number; /* Make sure we have an urb then send the message */ if (this_urb == NULL) { dev_dbg(&port->dev, "%s - oops no urb for port.\n", __func__); return -1; } /* Save reset port val for resend. Don't overwrite resend for open/close condition. */ if ((reset_port + 1) > p_priv->resend_cont) p_priv->resend_cont = reset_port + 1; if (this_urb->status == -EINPROGRESS) { /* dev_dbg(&port->dev, "%s - already writing\n", __func__); */ mdelay(5); return -1; } memset(&msg, 0, sizeof(struct keyspan_usa67_portControlMessage)); msg.port = device_port; /* Only set baud rate if it's changed */ if (p_priv->old_baud != p_priv->baud) { p_priv->old_baud = p_priv->baud; msg.setClocking = 0xff; if (d_details->calculate_baud_rate(port, p_priv->baud, d_details->baudclk, &msg.baudHi, &msg.baudLo, &msg.prescaler, device_port) == KEYSPAN_INVALID_BAUD_RATE) { dev_dbg(&port->dev, "%s - Invalid baud rate %d requested, using 9600.\n", __func__, p_priv->baud); msg.baudLo = 0; msg.baudHi = 125; /* Values for 9600 baud */ msg.prescaler = 10; } msg.setPrescaler = 0xff; } msg.lcr = (p_priv->cflag & CSTOPB) ? STOPBITS_678_2 : STOPBITS_5678_1; switch (p_priv->cflag & CSIZE) { case CS5: msg.lcr |= USA_DATABITS_5; break; case CS6: msg.lcr |= USA_DATABITS_6; break; case CS7: msg.lcr |= USA_DATABITS_7; break; case CS8: msg.lcr |= USA_DATABITS_8; break; } if (p_priv->cflag & PARENB) { /* note USA_PARITY_NONE == 0 */ msg.lcr |= (p_priv->cflag & PARODD) ? USA_PARITY_ODD : USA_PARITY_EVEN; } msg.setLcr = 0xff; msg.ctsFlowControl = (p_priv->flow_control == flow_cts); msg.xonFlowControl = 0; msg.setFlowControl = 0xff; msg.forwardingLength = 16; msg.xonChar = 17; msg.xoffChar = 19; if (reset_port == 1) { /* Opening port */ msg._txOn = 1; msg._txOff = 0; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 1; msg.rxOff = 0; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0xff; } else if (reset_port == 2) { /* Closing port */ msg._txOn = 0; msg._txOff = 1; msg.txFlush = 0; msg.txBreak = 0; msg.rxOn = 0; msg.rxOff = 1; msg.rxFlush = 1; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0; } else { /* Sending intermediate configs */ msg._txOn = (!p_priv->break_on); msg._txOff = 0; msg.txFlush = 0; msg.txBreak = (p_priv->break_on); msg.rxOn = 0; msg.rxOff = 0; msg.rxFlush = 0; msg.rxForward = 0; msg.returnStatus = 0; msg.resetDataToggle = 0x0; } /* Do handshaking outputs */ msg.setTxTriState_setRts = 0xff; msg.txTriState_rts = p_priv->rts_state; msg.setHskoa_setDtr = 0xff; msg.hskoa_dtr = p_priv->dtr_state; p_priv->resend_cont = 0; memcpy(this_urb->transfer_buffer, &msg, sizeof(msg)); /* send the data out the device on control endpoint */ this_urb->transfer_buffer_length = sizeof(msg); err = usb_submit_urb(this_urb, GFP_ATOMIC); if (err != 0) dev_dbg(&port->dev, "%s - usb_submit_urb(setup) failed (%d)\n", __func__, err); return 0; } static void keyspan_send_setup(struct usb_serial_port *port, int reset_port) { struct usb_serial *serial = port->serial; struct keyspan_serial_private *s_priv; const struct keyspan_device_details *d_details; s_priv = usb_get_serial_data(serial); d_details = s_priv->device_details; switch (d_details->msg_format) { case msg_usa26: keyspan_usa26_send_setup(serial, port, reset_port); break; case msg_usa28: keyspan_usa28_send_setup(serial, port, reset_port); break; case msg_usa49: keyspan_usa49_send_setup(serial, port, reset_port); break; case msg_usa90: keyspan_usa90_send_setup(serial, port, reset_port); break; case msg_usa67: keyspan_usa67_send_setup(serial, port, reset_port); break; } } /* Gets called by the "real" driver (ie once firmware is loaded and renumeration has taken place. */ static int keyspan_startup(struct usb_serial *serial) { int i, err; struct keyspan_serial_private *s_priv; const struct keyspan_device_details *d_details; for (i = 0; (d_details = keyspan_devices[i]) != NULL; ++i) if (d_details->product_id == le16_to_cpu(serial->dev->descriptor.idProduct)) break; if (d_details == NULL) { dev_err(&serial->dev->dev, "%s - unknown product id %x\n", __func__, le16_to_cpu(serial->dev->descriptor.idProduct)); return -ENODEV; } /* Setup private data for serial driver */ s_priv = kzalloc(sizeof(struct keyspan_serial_private), GFP_KERNEL); if (!s_priv) return -ENOMEM; s_priv->instat_buf = kzalloc(INSTAT_BUFLEN, GFP_KERNEL); if (!s_priv->instat_buf) goto err_instat_buf; s_priv->indat_buf = kzalloc(INDAT49W_BUFLEN, GFP_KERNEL); if (!s_priv->indat_buf) goto err_indat_buf; s_priv->glocont_buf = kzalloc(GLOCONT_BUFLEN, GFP_KERNEL); if (!s_priv->glocont_buf) goto err_glocont_buf; s_priv->ctrl_buf = kzalloc(sizeof(struct usb_ctrlrequest), GFP_KERNEL); if (!s_priv->ctrl_buf) goto err_ctrl_buf; s_priv->device_details = d_details; usb_set_serial_data(serial, s_priv); keyspan_setup_urbs(serial); if (s_priv->instat_urb != NULL) { err = usb_submit_urb(s_priv->instat_urb, GFP_KERNEL); if (err != 0) dev_dbg(&serial->dev->dev, "%s - submit instat urb failed %d\n", __func__, err); } if (s_priv->indat_urb != NULL) { err = usb_submit_urb(s_priv->indat_urb, GFP_KERNEL); if (err != 0) dev_dbg(&serial->dev->dev, "%s - submit indat urb failed %d\n", __func__, err); } return 0; err_ctrl_buf: kfree(s_priv->glocont_buf); err_glocont_buf: kfree(s_priv->indat_buf); err_indat_buf: kfree(s_priv->instat_buf); err_instat_buf: kfree(s_priv); return -ENOMEM; } static void keyspan_disconnect(struct usb_serial *serial) { struct keyspan_serial_private *s_priv; s_priv = usb_get_serial_data(serial); usb_kill_urb(s_priv->instat_urb); usb_kill_urb(s_priv->glocont_urb); usb_kill_urb(s_priv->indat_urb); } static void keyspan_release(struct usb_serial *serial) { struct keyspan_serial_private *s_priv; s_priv = usb_get_serial_data(serial); /* Make sure to unlink the URBs submitted in attach. */ usb_kill_urb(s_priv->instat_urb); usb_kill_urb(s_priv->indat_urb); usb_free_urb(s_priv->instat_urb); usb_free_urb(s_priv->indat_urb); usb_free_urb(s_priv->glocont_urb); kfree(s_priv->ctrl_buf); kfree(s_priv->glocont_buf); kfree(s_priv->indat_buf); kfree(s_priv->instat_buf); kfree(s_priv); } static int keyspan_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct keyspan_serial_private *s_priv; struct keyspan_port_private *p_priv; const struct keyspan_device_details *d_details; struct callbacks *cback; int endp; int port_num; int i; s_priv = usb_get_serial_data(serial); d_details = s_priv->device_details; p_priv = kzalloc(sizeof(*p_priv), GFP_KERNEL); if (!p_priv) return -ENOMEM; for (i = 0; i < ARRAY_SIZE(p_priv->in_buffer); ++i) { p_priv->in_buffer[i] = kzalloc(IN_BUFLEN, GFP_KERNEL); if (!p_priv->in_buffer[i]) goto err_free_in_buffer; } for (i = 0; i < ARRAY_SIZE(p_priv->out_buffer); ++i) { p_priv->out_buffer[i] = kzalloc(OUT_BUFLEN, GFP_KERNEL); if (!p_priv->out_buffer[i]) goto err_free_out_buffer; } p_priv->inack_buffer = kzalloc(INACK_BUFLEN, GFP_KERNEL); if (!p_priv->inack_buffer) goto err_free_out_buffer; p_priv->outcont_buffer = kzalloc(OUTCONT_BUFLEN, GFP_KERNEL); if (!p_priv->outcont_buffer) goto err_free_inack_buffer; p_priv->device_details = d_details; /* Setup values for the various callback routines */ cback = &keyspan_callbacks[d_details->msg_format]; port_num = port->port_number; /* Do indat endpoints first, once for each flip */ endp = d_details->indat_endpoints[port_num]; for (i = 0; i <= d_details->indat_endp_flip; ++i, ++endp) { p_priv->in_urbs[i] = keyspan_setup_urb(serial, endp, USB_DIR_IN, port, p_priv->in_buffer[i], IN_BUFLEN, cback->indat_callback); } /* outdat endpoints also have flip */ endp = d_details->outdat_endpoints[port_num]; for (i = 0; i <= d_details->outdat_endp_flip; ++i, ++endp) { p_priv->out_urbs[i] = keyspan_setup_urb(serial, endp, USB_DIR_OUT, port, p_priv->out_buffer[i], OUT_BUFLEN, cback->outdat_callback); } /* inack endpoint */ p_priv->inack_urb = keyspan_setup_urb(serial, d_details->inack_endpoints[port_num], USB_DIR_IN, port, p_priv->inack_buffer, INACK_BUFLEN, cback->inack_callback); /* outcont endpoint */ p_priv->outcont_urb = keyspan_setup_urb(serial, d_details->outcont_endpoints[port_num], USB_DIR_OUT, port, p_priv->outcont_buffer, OUTCONT_BUFLEN, cback->outcont_callback); usb_set_serial_port_data(port, p_priv); return 0; err_free_inack_buffer: kfree(p_priv->inack_buffer); err_free_out_buffer: for (i = 0; i < ARRAY_SIZE(p_priv->out_buffer); ++i) kfree(p_priv->out_buffer[i]); err_free_in_buffer: for (i = 0; i < ARRAY_SIZE(p_priv->in_buffer); ++i) kfree(p_priv->in_buffer[i]); kfree(p_priv); return -ENOMEM; } static void keyspan_port_remove(struct usb_serial_port *port) { struct keyspan_port_private *p_priv; int i; p_priv = usb_get_serial_port_data(port); usb_kill_urb(p_priv->inack_urb); usb_kill_urb(p_priv->outcont_urb); for (i = 0; i < 2; i++) { usb_kill_urb(p_priv->in_urbs[i]); usb_kill_urb(p_priv->out_urbs[i]); } usb_free_urb(p_priv->inack_urb); usb_free_urb(p_priv->outcont_urb); for (i = 0; i < 2; i++) { usb_free_urb(p_priv->in_urbs[i]); usb_free_urb(p_priv->out_urbs[i]); } kfree(p_priv->outcont_buffer); kfree(p_priv->inack_buffer); for (i = 0; i < ARRAY_SIZE(p_priv->out_buffer); ++i) kfree(p_priv->out_buffer[i]); for (i = 0; i < ARRAY_SIZE(p_priv->in_buffer); ++i) kfree(p_priv->in_buffer[i]); kfree(p_priv); } /* Structs for the devices, pre and post renumeration. */ static struct usb_serial_driver keyspan_pre_device = { .driver = { .name = "keyspan_no_firm", }, .description = "Keyspan - (without firmware)", .id_table = keyspan_pre_ids, .num_ports = 1, .attach = keyspan_fake_startup, }; static struct usb_serial_driver keyspan_1port_device = { .driver = { .name = "keyspan_1", }, .description = "Keyspan 1 port adapter", .id_table = keyspan_1port_ids, .num_ports = 1, .open = keyspan_open, .close = keyspan_close, .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, .break_ctl = keyspan_break_ctl, .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, .disconnect = keyspan_disconnect, .release = keyspan_release, .port_probe = keyspan_port_probe, .port_remove = keyspan_port_remove, }; static struct usb_serial_driver keyspan_2port_device = { .driver = { .name = "keyspan_2", }, .description = "Keyspan 2 port adapter", .id_table = keyspan_2port_ids, .num_ports = 2, .open = keyspan_open, .close = keyspan_close, .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, .break_ctl = keyspan_break_ctl, .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, .disconnect = keyspan_disconnect, .release = keyspan_release, .port_probe = keyspan_port_probe, .port_remove = keyspan_port_remove, }; static struct usb_serial_driver keyspan_4port_device = { .driver = { .name = "keyspan_4", }, .description = "Keyspan 4 port adapter", .id_table = keyspan_4port_ids, .num_ports = 4, .open = keyspan_open, .close = keyspan_close, .dtr_rts = keyspan_dtr_rts, .write = keyspan_write, .write_room = keyspan_write_room, .set_termios = keyspan_set_termios, .break_ctl = keyspan_break_ctl, .tiocmget = keyspan_tiocmget, .tiocmset = keyspan_tiocmset, .attach = keyspan_startup, .disconnect = keyspan_disconnect, .release = keyspan_release, .port_probe = keyspan_port_probe, .port_remove = keyspan_port_remove, }; static struct usb_serial_driver * const serial_drivers[] = { &keyspan_pre_device, &keyspan_1port_device, &keyspan_2port_device, &keyspan_4port_device, NULL }; module_usb_serial_driver(serial_drivers, keyspan_ids_combined); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_FIRMWARE("keyspan/usa28.fw"); MODULE_FIRMWARE("keyspan/usa28x.fw"); MODULE_FIRMWARE("keyspan/usa28xa.fw"); MODULE_FIRMWARE("keyspan/usa28xb.fw"); MODULE_FIRMWARE("keyspan/usa19.fw"); MODULE_FIRMWARE("keyspan/usa19qi.fw"); MODULE_FIRMWARE("keyspan/mpr.fw"); MODULE_FIRMWARE("keyspan/usa19qw.fw"); MODULE_FIRMWARE("keyspan/usa18x.fw"); MODULE_FIRMWARE("keyspan/usa19w.fw"); MODULE_FIRMWARE("keyspan/usa49w.fw"); MODULE_FIRMWARE("keyspan/usa49wlc.fw"); |
| 10 10 2 2 2 10 5 5 5 5 18 19 5 17 16 16 16 19 19 19 19 19 5 1 5 5 5 8 8 8 8 7 7 7 3 3 3 1 2 2 2 2 2 1 7 7 9 9 8 8 1 1 1 7 1 9 7 1 2 1 1 1 1 2 14 14 14 14 14 14 14 14 14 1 1 1 13 13 13 13 13 13 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. * Copyright (C) 2014 Marvell International Ltd. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/nfc.h> #include "nfc.h" #include "llcp.h" static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; static LIST_HEAD(llcp_devices); /* Protects llcp_devices list */ static DEFINE_SPINLOCK(llcp_devices_lock); static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock) { sock->remote_rw = LLCP_DEFAULT_RW; sock->remote_miu = LLCP_MAX_MIU + 1; } static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local = sock->local; struct sk_buff *s, *tmp; skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); if (local == NULL) return; /* Search for local pending SKBs that are related to this socket */ skb_queue_walk_safe(&local->tx_queue, s, tmp) { if (s->sk != &sock->sk) continue; skb_unlink(s, &local->tx_queue); kfree_skb(s); } } static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, int err) { struct sock *sk; struct hlist_node *tmp; struct nfc_llcp_sock *llcp_sock; skb_queue_purge(&local->tx_queue); write_lock(&local->sockets.lock); sk_for_each_safe(sk, tmp, &local->sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; bh_lock_sock(accept_sk); nfc_llcp_accept_unlink(accept_sk); if (err) accept_sk->sk_err = err; accept_sk->sk_state = LLCP_CLOSED; accept_sk->sk_state_change(sk); bh_unlock_sock(accept_sk); } } if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->sockets.lock); /* If we still have a device, we keep the RAW sockets alive */ if (device == true) return; write_lock(&local->raw_sockets.lock); sk_for_each_safe(sk, tmp, &local->raw_sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->raw_sockets.lock); } static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) { /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever * we hold a reference to local, we also need to hold a reference to * the device to avoid UAF. */ if (!nfc_get_device(local->dev->idx)) return NULL; kref_get(&local->ref); return local; } static void local_cleanup(struct nfc_llcp_local *local) { nfc_llcp_socket_release(local, false, ENXIO); timer_delete_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; timer_delete_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); } static void local_release(struct kref *ref) { struct nfc_llcp_local *local; local = container_of(ref, struct nfc_llcp_local, ref); local_cleanup(local); kfree(local); } int nfc_llcp_local_put(struct nfc_llcp_local *local) { struct nfc_dev *dev; int ret; if (local == NULL) return 0; dev = local->dev; ret = kref_put(&local->ref, local_release); nfc_put_device(dev); return ret; } static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, u8 ssap, u8 dsap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("ssap dsap %d %d\n", ssap, dsap); if (ssap == 0 && dsap == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) { llcp_sock = tmp_sock; sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); return llcp_sock; } static void nfc_llcp_sock_put(struct nfc_llcp_sock *sock) { sock_put(&sock->sk); } static void nfc_llcp_timeout_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, timeout_work); nfc_dep_link_down(local->dev); } static void nfc_llcp_symm_timer(struct timer_list *t) { struct nfc_llcp_local *local = timer_container_of(local, t, link_timer); pr_err("SYMM timeout\n"); schedule_work(&local->timeout_work); } static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) { unsigned long time; HLIST_HEAD(nl_sdres_list); struct hlist_node *n; struct nfc_llcp_sdp_tlv *sdp; struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, sdreq_timeout_work); mutex_lock(&local->sdreq_lock); time = jiffies - msecs_to_jiffies(3 * local->remote_lto); hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) { if (time_after(sdp->time, time)) continue; sdp->sap = LLCP_SDP_UNBOUND; hlist_del(&sdp->node); hlist_add_head(&sdp->node, &nl_sdres_list); } if (!hlist_empty(&local->pending_sdreqs)) mod_timer(&local->sdreq_timer, jiffies + msecs_to_jiffies(3 * local->remote_lto)); mutex_unlock(&local->sdreq_lock); if (!hlist_empty(&nl_sdres_list)) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); } static void nfc_llcp_sdreq_timer(struct timer_list *t) { struct nfc_llcp_local *local = timer_container_of(local, t, sdreq_timer); schedule_work(&local->sdreq_timeout_work); } struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { struct nfc_llcp_local *local; struct nfc_llcp_local *res = NULL; spin_lock(&llcp_devices_lock); list_for_each_entry(local, &llcp_devices, list) if (local->dev == dev) { res = nfc_llcp_local_get(local); break; } spin_unlock(&llcp_devices_lock); return res; } static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev) { struct nfc_llcp_local *local, *tmp; spin_lock(&llcp_devices_lock); list_for_each_entry_safe(local, tmp, &llcp_devices, list) if (local->dev == dev) { spin_lock(&local->tx_queue.lock); list_del_init(&local->list); spin_unlock(&local->tx_queue.lock); spin_unlock(&llcp_devices_lock); return local; } spin_unlock(&llcp_devices_lock); pr_warn("Shutting down device not found\n"); return NULL; } static char *wks[] = { NULL, NULL, /* SDP */ "urn:nfc:sn:ip", "urn:nfc:sn:obex", "urn:nfc:sn:snep", }; static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; pr_debug("%s\n", service_name); if (service_name == NULL) return -EINVAL; num_wks = ARRAY_SIZE(wks); for (sap = 0; sap < num_wks; sap++) { if (wks[sap] == NULL) continue; if (strncmp(wks[sap], service_name, service_name_len) == 0) return sap; } return -EINVAL; } static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len, bool needref) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("sn %zd %p\n", sn_len, sn); if (sn == NULL || sn_len == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); pr_debug("llcp sock %p\n", tmp_sock); if (tmp_sock->sk.sk_type == SOCK_STREAM && tmp_sock->sk.sk_state != LLCP_LISTEN) continue; if (tmp_sock->sk.sk_type == SOCK_DGRAM && tmp_sock->sk.sk_state != LLCP_BOUND) continue; if (tmp_sock->service_name == NULL || tmp_sock->service_name_len == 0) continue; if (tmp_sock->service_name_len != sn_len) continue; if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) { llcp_sock = tmp_sock; if (needref) sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); pr_debug("Found llcp sock %p\n", llcp_sock); return llcp_sock; } u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, struct nfc_llcp_sock *sock) { mutex_lock(&local->sdp_lock); if (sock->service_name != NULL && sock->service_name_len > 0) { int ssap = nfc_llcp_wks_sap(sock->service_name, sock->service_name_len); if (ssap > 0) { pr_debug("WKS %d\n", ssap); /* This is a WKS, let's check if it's free */ if (test_bit(ssap, &local->local_wks)) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return ssap; } /* * Check if there already is a non WKS socket bound * to this service name. */ if (nfc_llcp_sock_from_sn(local, sock->service_name, sock->service_name_len, false) != NULL) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } mutex_unlock(&local->sdp_lock); return LLCP_SDP_UNBOUND; } else if (sock->ssap != 0 && sock->ssap < LLCP_WKS_NUM_SAP) { if (!test_bit(sock->ssap, &local->local_wks)) { set_bit(sock->ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return sock->ssap; } } mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local) { u8 local_ssap; mutex_lock(&local->sdp_lock); local_ssap = find_first_zero_bit(&local->local_sap, LLCP_LOCAL_NUM_SAP); if (local_ssap == LLCP_LOCAL_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(local_ssap, &local->local_sap); mutex_unlock(&local->sdp_lock); return local_ssap + LLCP_LOCAL_SAP_OFFSET; } void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap) { u8 local_ssap; unsigned long *sdp; if (ssap < LLCP_WKS_NUM_SAP) { local_ssap = ssap; sdp = &local->local_wks; } else if (ssap < LLCP_LOCAL_NUM_SAP) { atomic_t *client_cnt; local_ssap = ssap - LLCP_WKS_NUM_SAP; sdp = &local->local_sdp; client_cnt = &local->local_sdp_cnt[local_ssap]; pr_debug("%d clients\n", atomic_read(client_cnt)); mutex_lock(&local->sdp_lock); if (atomic_dec_and_test(client_cnt)) { struct nfc_llcp_sock *l_sock; pr_debug("No more clients for SAP %d\n", ssap); clear_bit(local_ssap, sdp); /* Find the listening sock and set it back to UNBOUND */ l_sock = nfc_llcp_sock_get(local, ssap, LLCP_SAP_SDP); if (l_sock) { l_sock->ssap = LLCP_SDP_UNBOUND; nfc_llcp_sock_put(l_sock); } } mutex_unlock(&local->sdp_lock); return; } else if (ssap < LLCP_MAX_SAP) { local_ssap = ssap - LLCP_LOCAL_NUM_SAP; sdp = &local->local_sap; } else { return; } mutex_lock(&local->sdp_lock); clear_bit(local_ssap, sdp); mutex_unlock(&local->sdp_lock); } static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local) { u8 ssap; mutex_lock(&local->sdp_lock); ssap = find_first_zero_bit(&local->local_sdp, LLCP_SDP_NUM_SAP); if (ssap == LLCP_SDP_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } pr_debug("SDP ssap %d\n", LLCP_WKS_NUM_SAP + ssap); set_bit(ssap, &local->local_sdp); mutex_unlock(&local->sdp_lock); return LLCP_WKS_NUM_SAP + ssap; } static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; int ret = 0; version = LLCP_VERSION_11; version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version, 1, &version_length); if (!version_tlv) { ret = -ENOMEM; goto out; } gb_len += version_length; lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, <o_length); if (!lto_tlv) { ret = -ENOMEM; goto out; } gb_len += lto_length; pr_debug("Local wks 0x%lx\n", local->local_wks); wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length); if (!wks_tlv) { ret = -ENOMEM; goto out; } gb_len += wks_length; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, &miux_length); if (!miux_tlv) { ret = -ENOMEM; goto out; } gb_len += miux_length; gb_len += ARRAY_SIZE(llcp_magic); if (gb_len > NFC_MAX_GT_LEN) { ret = -EINVAL; goto out; } gb_cur = local->gb; memcpy(gb_cur, llcp_magic, ARRAY_SIZE(llcp_magic)); gb_cur += ARRAY_SIZE(llcp_magic); memcpy(gb_cur, version_tlv, version_length); gb_cur += version_length; memcpy(gb_cur, lto_tlv, lto_length); gb_cur += lto_length; memcpy(gb_cur, wks_tlv, wks_length); gb_cur += wks_length; memcpy(gb_cur, miux_tlv, miux_length); gb_cur += miux_length; local->gb_len = gb_len; out: kfree(version_tlv); kfree(lto_tlv); kfree(wks_tlv); kfree(miux_tlv); return ret; } u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) { *general_bytes_len = 0; return NULL; } nfc_llcp_build_gb(local); *general_bytes_len = local->gb_len; nfc_llcp_local_put(local); return local->gb; } int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; int err; if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN) return -EINVAL; local = nfc_llcp_find_local(dev); if (local == NULL) { pr_err("No LLCP device\n"); return -ENODEV; } memset(local->remote_gb, 0, NFC_MAX_GT_LEN); memcpy(local->remote_gb, gb, gb_len); local->remote_gb_len = gb_len; if (memcmp(local->remote_gb, llcp_magic, 3)) { pr_err("MAC does not support LLCP\n"); err = -EINVAL; goto out; } err = nfc_llcp_parse_gb_tlv(local, &local->remote_gb[3], local->remote_gb_len - 3); out: nfc_llcp_local_put(local); return err; } static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu) { pdu->data[2] = (sock->send_n << 4) | (sock->recv_n); sock->send_n = (sock->send_n + 1) % 16; sock->recv_ack_n = (sock->recv_n - 1) % 16; } void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction) { struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&local->raw_sockets.lock); sk_for_each(sk, &local->raw_sockets.head) { if (sk->sk_state != LLCP_BOUND) continue; if (skb_copy == NULL) { skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, GFP_ATOMIC, true); if (skb_copy == NULL) continue; data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = local->dev ? local->dev->idx : 0xFF; data[1] = direction & 0x01; data[1] |= (RAW_PAYLOAD_LLCP << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&local->raw_sockets.lock); kfree_skb(skb_copy); } static void nfc_llcp_tx_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, tx_work); struct sk_buff *skb; struct sock *sk; struct nfc_llcp_sock *llcp_sock; skb = skb_dequeue(&local->tx_queue); if (skb != NULL) { sk = skb->sk; llcp_sock = nfc_llcp_sock(sk); if (llcp_sock == NULL && nfc_llcp_ptype(skb) == LLCP_PDU_I) { kfree_skb(skb); nfc_llcp_send_symm(local->dev); } else if (llcp_sock && !llcp_sock->remote_ready) { skb_queue_head(&local->tx_queue, skb); nfc_llcp_send_symm(local->dev); } else { struct sk_buff *copy_skb = NULL; u8 ptype = nfc_llcp_ptype(skb); int ret; pr_debug("Sending pending skb\n"); print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); if (ptype == LLCP_PDU_I) copy_skb = skb_copy(skb, GFP_ATOMIC); __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); ret = nfc_data_exchange(local->dev, local->target_idx, skb, nfc_llcp_recv, local); if (ret) { kfree_skb(copy_skb); goto out; } if (ptype == LLCP_PDU_I && copy_skb) skb_queue_tail(&llcp_sock->tx_pending_queue, copy_skb); } } else { nfc_llcp_send_symm(local->dev); } out: mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(2 * local->remote_lto)); } static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local *local, u8 ssap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; read_lock(&local->connecting_sockets.lock); sk_for_each(sk, &local->connecting_sockets.head) { llcp_sock = nfc_llcp_sock(sk); if (llcp_sock->ssap == ssap) { sock_hold(&llcp_sock->sk); goto out; } } llcp_sock = NULL; out: read_unlock(&local->connecting_sockets.lock); return llcp_sock; } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len) { return nfc_llcp_sock_from_sn(local, sn, sn_len, true); } static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { u8 type, length; const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); if (type == LLCP_TLV_SN) { *sn_len = length; return &tlv[2]; } offset += length + 2; tlv += length + 2; } return NULL; } static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct nfc_llcp_ui_cb *ui_cb; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ui_cb = nfc_llcp_ui_skb_cb(skb); ui_cb->dsap = dsap; ui_cb->ssap = ssap; pr_debug("%d %d\n", dsap, ssap); /* We're looking for a bound socket, not a client one */ llcp_sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (llcp_sock == NULL || llcp_sock->sk.sk_type != SOCK_DGRAM) return; /* There is no sequence with UI frames */ skb_pull(skb, LLCP_HEADER_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * UI frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP) { sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (sock == NULL || sock->sk.sk_state != LLCP_LISTEN) { reason = LLCP_DM_NOBOUND; goto fail; } } else { const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); if (sn == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } pr_debug("Service name length %zu\n", sn_len); sock = nfc_llcp_sock_get_sn(local, sn, sn_len); if (sock == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } } lock_sock(&sock->sk); parent = &sock->sk; if (sk_acceptq_is_full(parent)) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } if (sock->ssap == LLCP_SDP_UNBOUND) { u8 ssap = nfc_llcp_reserve_sdp_ssap(local); pr_debug("First client, reserving %d\n", ssap); if (ssap == LLCP_SAP_MAX) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } sock->ssap = ssap; } new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock = nfc_llcp_sock(new_sk); new_sock->local = nfc_llcp_local_get(local); if (!new_sock->local) { reason = LLCP_DM_REJ; sock_put(&new_sock->sk); release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock->dev = local->dev; new_sock->rw = sock->rw; new_sock->miux = sock->miux; new_sock->nfc_protocol = sock->nfc_protocol; new_sock->dsap = ssap; new_sock->target_idx = local->target_idx; new_sock->parent = parent; new_sock->ssap = sock->ssap; if (sock->ssap < LLCP_LOCAL_NUM_SAP && sock->ssap >= LLCP_WKS_NUM_SAP) { atomic_t *client_count; pr_debug("reserved_ssap %d for %p\n", sock->ssap, new_sock); client_count = &local->local_sdp_cnt[sock->ssap - LLCP_WKS_NUM_SAP]; atomic_inc(client_count); new_sock->reserved_ssap = sock->ssap; } nfc_llcp_parse_connection_tlv(new_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); pr_debug("new sock %p sk %p\n", new_sock, &new_sock->sk); nfc_llcp_sock_link(&local->sockets, new_sk); nfc_llcp_accept_enqueue(&sock->sk, new_sk); nfc_get_device(local->dev->idx); new_sk->sk_state = LLCP_CONNECTED; /* Wake the listening processes */ parent->sk_data_ready(parent); /* Send CC */ nfc_llcp_send_cc(new_sock); release_sock(&sock->sk); sock_put(&sock->sk); return; fail: /* Send DM */ nfc_llcp_send_dm(local, dsap, ssap, reason); } int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock) { int nr_frames = 0; struct nfc_llcp_local *local = sock->local; pr_debug("Remote ready %d tx queue len %d remote rw %d", sock->remote_ready, skb_queue_len(&sock->tx_pending_queue), sock->remote_rw); /* Try to queue some I frames for transmission */ while (sock->remote_ready && skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) { struct sk_buff *pdu; pdu = skb_dequeue(&sock->tx_queue); if (pdu == NULL) break; /* Update N(S)/N(R) */ nfc_llcp_set_nrns(sock, pdu); skb_queue_tail(&local->tx_queue, pdu); nr_frames++; } return nr_frames; } static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, ptype, ns, nr; ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ns = nfc_llcp_ns(skb); nr = nfc_llcp_nr(skb); pr_debug("%d %d R %d S %d\n", dsap, ssap, nr, ns); llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } /* Pass the payload upstream */ if (ptype == LLCP_PDU_I) { pr_debug("I frame, queueing on %p\n", &llcp_sock->sk); if (ns == llcp_sock->recv_n) llcp_sock->recv_n = (llcp_sock->recv_n + 1) % 16; else pr_err("Received out of sequence I PDU\n"); skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * I frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } } /* Remove skbs from the pending queue */ if (llcp_sock->send_ack_n != nr) { struct sk_buff *s, *tmp; u8 n; llcp_sock->send_ack_n = nr; /* Remove and free all skbs until ns == nr */ skb_queue_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { n = nfc_llcp_ns(s); skb_unlink(s, &llcp_sock->tx_pending_queue); kfree_skb(s); if (n == nr) break; } /* Re-queue the remaining skbs for transmission */ skb_queue_reverse_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { skb_unlink(s, &llcp_sock->tx_pending_queue); skb_queue_head(&local->tx_queue, s); } } if (ptype == LLCP_PDU_RR) llcp_sock->remote_ready = true; else if (ptype == LLCP_PDU_RNR) llcp_sock->remote_ready = false; if (nfc_llcp_queue_i_frames(llcp_sock) == 0 && ptype == LLCP_PDU_I) nfc_llcp_send_rr(llcp_sock); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); if ((dsap == 0) && (ssap == 0)) { pr_debug("Connection termination"); nfc_dep_link_down(local->dev); return; } llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } if (sk->sk_state == LLCP_CONNECTED) { nfc_put_device(local->dev); sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); } nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_DISC); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); if (llcp_sock == NULL) { pr_err("Invalid CC\n"); nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; /* Unlink from connecting and link to the client array */ nfc_llcp_sock_unlink(&local->connecting_sockets, sk); nfc_llcp_sock_link(&local->sockets, sk); llcp_sock->dsap = ssap; nfc_llcp_parse_connection_tlv(llcp_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); sk->sk_state = LLCP_CONNECTED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); reason = skb->data[2]; pr_debug("%d %d reason %d\n", ssap, dsap, reason); switch (reason) { case LLCP_DM_NOBOUND: case LLCP_DM_REJ: llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); break; default: llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); break; } if (llcp_sock == NULL) { pr_debug("Already closed\n"); return; } sk = &llcp_sock->sk; sk->sk_err = ENXIO; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; u8 dsap, ssap, type, length, tid, sap; const u8 *tlv; u16 tlv_len, offset; const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); size_t sdres_tlvs_len; HLIST_HEAD(nl_sdres_list); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP || ssap != LLCP_SAP_SDP) { pr_err("Wrong SNL SAP\n"); return; } tlv = &skb->data[LLCP_HEADER_SIZE]; tlv_len = skb->len - LLCP_HEADER_SIZE; offset = 0; sdres_tlvs_len = 0; while (offset < tlv_len) { type = tlv[0]; length = tlv[1]; switch (type) { case LLCP_TLV_SDREQ: tid = tlv[2]; service_name = (char *) &tlv[3]; service_name_len = length - 1; pr_debug("Looking for %.16s\n", service_name); if (service_name_len == strlen("urn:nfc:sn:sdp") && !strncmp(service_name, "urn:nfc:sn:sdp", service_name_len)) { sap = 1; goto add_snl; } llcp_sock = nfc_llcp_sock_from_sn(local, service_name, service_name_len, true); if (!llcp_sock) { sap = 0; goto add_snl; } /* * We found a socket but its ssap has not been reserved * yet. We need to assign it for good and send a reply. * The ssap will be freed when the socket is closed. */ if (llcp_sock->ssap == LLCP_SDP_UNBOUND) { atomic_t *client_count; sap = nfc_llcp_reserve_sdp_ssap(local); pr_debug("Reserving %d\n", sap); if (sap == LLCP_SAP_MAX) { sap = 0; nfc_llcp_sock_put(llcp_sock); goto add_snl; } client_count = &local->local_sdp_cnt[sap - LLCP_WKS_NUM_SAP]; atomic_inc(client_count); llcp_sock->ssap = sap; llcp_sock->reserved_ssap = sap; } else { sap = llcp_sock->ssap; } pr_debug("%p %d\n", llcp_sock, sap); nfc_llcp_sock_put(llcp_sock); add_snl: sdp = nfc_llcp_build_sdres_tlv(tid, sap); if (sdp == NULL) goto exit; sdres_tlvs_len += sdp->tlv_len; hlist_add_head(&sdp->node, &llc_sdres_list); break; case LLCP_TLV_SDRES: mutex_lock(&local->sdreq_lock); pr_debug("LLCP_TLV_SDRES: searching tid %d\n", tlv[2]); hlist_for_each_entry(sdp, &local->pending_sdreqs, node) { if (sdp->tid != tlv[2]) continue; sdp->sap = tlv[3]; pr_debug("Found: uri=%s, sap=%d\n", sdp->uri, sdp->sap); hlist_del(&sdp->node); hlist_add_head(&sdp->node, &nl_sdres_list); break; } mutex_unlock(&local->sdreq_lock); break; default: pr_err("Invalid SNL tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } exit: if (!hlist_empty(&nl_sdres_list)) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); if (!hlist_empty(&llc_sdres_list)) nfc_llcp_send_snl_sdres(local, &llc_sdres_list, sdres_tlvs_len); } static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb) { u8 ptype; u16 pdu_len; struct sk_buff *new_skb; if (skb->len <= LLCP_HEADER_SIZE) { pr_err("Malformed AGF PDU\n"); return; } skb_pull(skb, LLCP_HEADER_SIZE); while (skb->len > LLCP_AGF_PDU_HEADER_SIZE) { pdu_len = skb->data[0] << 8 | skb->data[1]; skb_pull(skb, LLCP_AGF_PDU_HEADER_SIZE); if (pdu_len < LLCP_HEADER_SIZE || pdu_len > skb->len) { pr_err("Malformed AGF PDU\n"); return; } ptype = nfc_llcp_ptype(skb); if (ptype == LLCP_PDU_SYMM || ptype == LLCP_PDU_AGF) goto next; new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL); if (new_skb == NULL) { pr_err("Could not allocate PDU\n"); return; } skb_put_data(new_skb, skb->data, pdu_len); nfc_llcp_rx_skb(local, new_skb); kfree_skb(new_skb); next: skb_pull(skb, pdu_len); } } static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb) { u8 dsap, ssap, ptype; ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); if (ptype != LLCP_PDU_SYMM) print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); switch (ptype) { case LLCP_PDU_SYMM: pr_debug("SYMM\n"); break; case LLCP_PDU_UI: pr_debug("UI\n"); nfc_llcp_recv_ui(local, skb); break; case LLCP_PDU_CONNECT: pr_debug("CONNECT\n"); nfc_llcp_recv_connect(local, skb); break; case LLCP_PDU_DISC: pr_debug("DISC\n"); nfc_llcp_recv_disc(local, skb); break; case LLCP_PDU_CC: pr_debug("CC\n"); nfc_llcp_recv_cc(local, skb); break; case LLCP_PDU_DM: pr_debug("DM\n"); nfc_llcp_recv_dm(local, skb); break; case LLCP_PDU_SNL: pr_debug("SNL\n"); nfc_llcp_recv_snl(local, skb); break; case LLCP_PDU_I: case LLCP_PDU_RR: case LLCP_PDU_RNR: pr_debug("I frame\n"); nfc_llcp_recv_hdlc(local, skb); break; case LLCP_PDU_AGF: pr_debug("AGF frame\n"); nfc_llcp_recv_agf(local, skb); break; } } static void nfc_llcp_rx_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, rx_work); struct sk_buff *skb; skb = local->rx_pending; if (skb == NULL) { pr_debug("No pending SKB\n"); return; } __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_RX); nfc_llcp_rx_skb(local, skb); schedule_work(&local->tx_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; } static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb) { local->rx_pending = skb; timer_delete(&local->link_timer); schedule_work(&local->rx_work); } void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) { struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; if (err < 0) { pr_err("LLCP PDU receive err %d\n", err); return; } __nfc_llcp_recv(local, skb); } int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) { kfree_skb(skb); return -ENODEV; } __nfc_llcp_recv(local, skb); nfc_llcp_local_put(local); return 0; } void nfc_llcp_mac_is_down(struct nfc_dev *dev) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) return; local->remote_miu = LLCP_DEFAULT_MIU; local->remote_lto = LLCP_DEFAULT_LTO; /* Close and purge all existing sockets */ nfc_llcp_socket_release(local, true, 0); nfc_llcp_local_put(local); } void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct nfc_llcp_local *local; pr_debug("rf mode %d\n", rf_mode); local = nfc_llcp_find_local(dev); if (local == NULL) return; local->target_idx = target_idx; local->comm_mode = comm_mode; local->rf_mode = rf_mode; if (rf_mode == NFC_RF_INITIATOR) { pr_debug("Queueing Tx work\n"); schedule_work(&local->tx_work); } else { mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(local->remote_lto)); } nfc_llcp_local_put(local); } int nfc_llcp_register_device(struct nfc_dev *ndev) { struct nfc_llcp_local *local; local = kzalloc(sizeof(struct nfc_llcp_local), GFP_KERNEL); if (local == NULL) return -ENOMEM; /* As we are going to initialize local's refcount, we need to get the * nfc_dev to avoid UAF, otherwise there is no point in continuing. * See nfc_llcp_local_get(). */ local->dev = nfc_get_device(ndev->idx); if (!local->dev) { kfree(local); return -ENODEV; } INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); timer_setup(&local->link_timer, nfc_llcp_symm_timer, 0); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); local->rx_pending = NULL; INIT_WORK(&local->rx_work, nfc_llcp_rx_work); INIT_WORK(&local->timeout_work, nfc_llcp_timeout_work); rwlock_init(&local->sockets.lock); rwlock_init(&local->connecting_sockets.lock); rwlock_init(&local->raw_sockets.lock); local->lto = 150; /* 1500 ms */ local->rw = LLCP_MAX_RW; local->miux = cpu_to_be16(LLCP_MAX_MIUX); local->local_wks = 0x1; /* LLC Link Management */ nfc_llcp_build_gb(local); local->remote_miu = LLCP_DEFAULT_MIU; local->remote_lto = LLCP_DEFAULT_LTO; mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); spin_unlock(&llcp_devices_lock); return 0; } void nfc_llcp_unregister_device(struct nfc_dev *dev) { struct nfc_llcp_local *local = nfc_llcp_remove_local(dev); if (local == NULL) { pr_debug("No such device\n"); return; } local_cleanup(local); nfc_llcp_local_put(local); } int __init nfc_llcp_init(void) { return nfc_llcp_sock_init(); } void nfc_llcp_exit(void) { nfc_llcp_sock_exit(); } |
| 79 1 3 35 35 25 121 42 117 23 23 23 2 15 6 1 15 41 41 41 41 1 1 1 1 39 39 41 41 20 20 15 20 20 1 20 15 15 14 6 9 6 92 67 67 67 67 47 41 67 67 47 47 41 8 1 41 66 1 66 10 60 67 18 52 67 46 92 20 25 19 1 10 20 35 5 24 23 16 58 4 5 63 6 63 18 5 63 5 5 5 1 1 1 5 56 55 134 117 134 1 1 1 1 8 1 1 1 1 29 13 29 29 7 7 7 7 52 4 8 2 2 2 2 2 3 3 3 3 3 3 2 3 3 118 118 118 148 28 28 27 139 149 45 149 80 26 2 52 56 53 45 6 45 14 13 1 3 1 1 54 56 57 57 57 53 14 57 3 2 2 2 2 2 15 15 14 23 23 23 22 16 15 3 14 9 8 8 6 6 2 2 2 23 23 23 23 23 16 16 14 6 23 21 12 11 3 12 11 11 9 8 8 8 8 1 1 1 11 1 1 11 1 22 29 30 30 3 13 13 9 2 9 4 2 9 8 3 9 8 4 30 7 7 4 2 4 4 4 9 9 7 7 5 7 7 9 7 2 2 2 2 4 3 2 4 4 1 1 1 1 1 7 7 15 5 3 3 5 29 3 29 29 58 11 2 4 11 4 58 9 8 9 8 2 2 2 2 2 2 588 107 9 9 2 587 13 13 9 11 11 7 11 11 11 11 1 3 1 1 1 1 25 10 5 15 10 15 12 2 10 7 7 15 3 5 5 10 10 4 3 9 492 95 16 16 16 23 23 23 19 17 12 12 4 17 13 13 7 13 9 10 3 13 25 19 23 6 25 25 4 4 4 2 2 4 4 7 7 7 7 7 5 5 28 25 28 58 7 1 58 4 4 10 4 1 1 9 4 4 4 4 4 4 4 4 4 4 2 4 4 2 4 4 3 4 2 1 1 4 4 10 3 3 7 2 2 2 2 2 2 2 7 4 1 1 23 15 15 9 1 9 7 9 9 9 2 2 4 4 1 52 125 215 180 39 7 1 6 34 7 5 1 1 13 41 35 213 111 33 40 111 33 7 33 36 56 56 30 24 1 23 192 184 185 7 36 36 36 36 30 30 36 6 36 35 36 35 1 36 36 36 36 192 113 114 20 13 13 13 12 13 1 187 158 186 30 30 30 30 30 30 30 30 566 568 8 106 106 567 117 5 111 111 14 13 13 13 13 1 1 1 1 1 1 1 27 3 3 2 2 2 2 2 27 4 4 25 25 25 25 25 25 21 24 25 527 107 107 1 4 4 4 4 1 20 1 20 4 4 4 4 5 5 2 2 5 2 1 2 1 7 10 7 11 10 7 4 7 7 4 3 1 3 1 10 5 10 3 1 3 4 4 2 3 3 4 586 586 585 16 16 3 3 13 7 6 1 13 6 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 | // SPDX-License-Identifier: GPL-2.0-only /* * Local APIC virtualization * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Authors: * Dor Laor <dor.laor@qumranet.com> * Gregory Haskins <ghaskins@novell.com> * Yaozu (Eddie) Dong <eddie.dong@intel.com> * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/smp.h> #include <linux/hrtimer.h> #include <linux/io.h> #include <linux/export.h> #include <linux/math64.h> #include <linux/slab.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> #include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" #include "ioapic.h" #include "trace.h" #include "x86.h" #include "xen.h" #include "cpuid.h" #include "hyperv.h" #include "smm.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) #else #define mod_64(x, y) ((x) % (y)) #endif /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* * Enable local APIC timer advancement (tscdeadline mode only) with adaptive * tuning. When enabled, KVM programs the host timer event to fire early, i.e. * before the deadline expires, to account for the delay between taking the * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume * the guest, i.e. so that the interrupt arrives in the guest with minimal * latency relative to the deadline programmed by the guest. */ static bool lapic_timer_advance __read_mostly = true; module_param(lapic_timer_advance, bool, 0444); #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 static bool __read_mostly vector_hashing_enabled = true; module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { apic_set_reg(apic->regs, reg_off, val); } static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) { return apic_get_reg64(apic->regs, reg); } static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, int reg, u64 val) { apic_set_reg64(apic->regs, reg, val); } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; return apic_test_vector(vector, apic->regs + APIC_ISR) || apic_test_vector(vector, apic->regs + APIC_IRR); } __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); } #define LVT_MASK \ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) #define LINT_MASK \ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; } static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { return kvm_x86_ops.set_hv_timer && !(kvm_mwait_in_guest(vcpu->kvm) || kvm_can_post_timer_interrupt(vcpu)); } static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE; } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); } static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->logical_mode) { case KVM_APIC_MODE_SW_DISABLED: /* Arbitrarily use the flat map so that @cluster isn't NULL. */ *cluster = map->xapic_flat_map; *mask = 0; return true; case KVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; u32 max_apic_id = map->max_apic_id; if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { *mask = 0; } return true; } case KVM_APIC_MODE_XAPIC_FLAT: *cluster = map->xapic_flat_map; *mask = dest_id & 0xff; return true; case KVM_APIC_MODE_XAPIC_CLUSTER: *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; case KVM_APIC_MODE_MAP_DISABLED: return false; default: WARN_ON_ONCE(1); return false; } } static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) { struct kvm_lapic *apic = vcpu->arch.apic; u32 x2apic_id = kvm_x2apic_id(apic); u32 xapic_id = kvm_xapic_id(apic); u32 physical_id; /* * For simplicity, KVM always allocates enough space for all possible * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on * without the optimized map. */ if (WARN_ON_ONCE(xapic_id > new->max_apic_id)) return -EINVAL; /* * Bail if a vCPU was added and/or enabled its APIC between allocating * the map and doing the actual calculations for the map. Note, KVM * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if * the compiler decides to reload x2apic_id after this check. */ if (x2apic_id > new->max_apic_id) return -E2BIG; /* * Deliberately truncate the vCPU ID when detecting a mismatched APIC * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a * 32-bit value. Any unwanted aliasing due to truncation results will * be detected below. */ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id) *xapic_id_mismatch = true; /* * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs. * Allow sending events to vCPUs by their x2APIC ID even if the target * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap * and collide). * * Honor the architectural (and KVM's non-optimized) behavior if * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed * to process messages independently. If multiple vCPUs have the same * effective APIC ID, e.g. due to the x2APIC wrap or because the guest * manually modified its xAPIC IDs, events targeting that ID are * supposed to be recognized by all vCPUs with said ID. */ if (vcpu->kvm->arch.x2apic_format) { /* See also kvm_apic_match_physical_addr(). */ if (apic_x2apic_mode(apic) || x2apic_id > 0xff) new->phys_map[x2apic_id] = apic; if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; } else { /* * Disable the optimized map if the physical APIC ID is already * mapped, i.e. is aliased to multiple vCPUs. The optimized * map requires a strict 1:1 mapping between IDs and vCPUs. */ if (apic_x2apic_mode(apic)) physical_id = x2apic_id; else physical_id = xapic_id; if (new->phys_map[physical_id]) return -EINVAL; new->phys_map[physical_id] = apic; } return 0; } static void kvm_recalculate_logical_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; enum kvm_apic_logical_mode logical_mode; struct kvm_lapic **cluster; u16 mask; u32 ldr; if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) return; if (!kvm_apic_sw_enabled(apic)) return; ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (!ldr) return; if (apic_x2apic_mode(apic)) { logical_mode = KVM_APIC_MODE_X2APIC; } else { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) logical_mode = KVM_APIC_MODE_XAPIC_FLAT; else logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; } /* * To optimize logical mode delivery, all software-enabled APICs must * be configured for the same mode. */ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { new->logical_mode = logical_mode; } else if (new->logical_mode != logical_mode) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; return; } /* * In x2APIC mode, the LDR is read-only and derived directly from the * x2APIC ID, thus is guaranteed to be addressable. KVM reuses * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ if (apic_x2apic_mode(apic)) return; if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; return; } if (!mask) return; ldr = ffs(mask) - 1; if (!is_power_of_2(mask) || cluster[ldr]) new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; else cluster[ldr] = apic; } /* * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock. * * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with * apic_map_lock_held. */ enum { CLEAN, UPDATE_IN_PROGRESS, DIRTY }; static void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ bool xapic_id_mismatch; int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; WARN_ONCE(!irqchip_in_kernel(kvm), "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); retry: /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) * or the APIC registers (if dirty). Note, on retry the map may have * not yet been marked dirty by whatever task changed a vCPU's x2APIC * ID, i.e. the map may still show up as in-progress. In that case * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { /* Someone else has updated the map. */ mutex_unlock(&kvm->arch.apic_map_lock); return; } /* * Reset the mismatch flag between attempts so that KVM does the right * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. * keep max_id strictly increasing. Disallowing max_id from shrinking * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU * with the highest x2APIC ID is toggling its APIC on and off. */ xapic_id_mismatch = false; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL_ACCOUNT); if (!new) goto out; new->max_apic_id = max_id; new->logical_mode = KVM_APIC_MODE_SW_DISABLED; kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); if (r) { kvfree(new); new = NULL; if (r == -E2BIG) { cond_resched(); goto retry; } goto out; } kvm_recalculate_logical_map(new, vcpu); } out: /* * The optimized map is effectively KVM's internal version of APICv, * and all unwanted aliasing that results in disabling the optimized * map also applies to APICv. */ if (!new) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); if (xapic_id_mismatch) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); /* * Write kvm->arch.apic_map before clearing apic->apic_map_dirty. * If another update has come in, leave it DIRTY. */ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty, UPDATE_IN_PROGRESS, CLEAN); mutex_unlock(&kvm->arch.apic_map_lock); if (old) kvfree_rcu(old, rcu); kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { bool enabled = val & APIC_SPIV_APIC_ENABLED; kvm_lapic_set_reg(apic, APIC_SPIV, val); if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) static_branch_slow_dec_deferred(&apic_sw_disabled); else static_branch_inc(&apic_sw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } /* Check if there are APF page ready requests pending */ if (enabled) { kvm_make_request(KVM_REQ_APF_READY, apic->vcpu); kvm_xen_sw_enable_lapic(apic->vcpu); } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) { kvm_lapic_set_reg(apic, APIC_DFR, val); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = kvm_apic_calc_x2apic_ldr(id); WARN_ON_ONCE(id != apic->vcpu->vcpu_id); kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; } static inline int apic_lvtt_period(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; } static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) { return apic->nr_lvt_entries > lvt_index; } static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) { return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); } void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 v = 0; if (!lapic_in_kernel(vcpu)) return; v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC * version first and level-triggered interrupts never get EOIed in * IOAPIC. */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) { int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; int i; if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) return; /* Initialize/mask any "new" LVT entries. */ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic->nr_lvt_entries = nr_lvt_entries; /* The number of LVT entries is reflected in the version register. */ kvm_apic_set_version(vcpu); } static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, [LVT_LINT0] = LINT_MASK, [LVT_LINT1] = LINT_MASK, [LVT_ERROR] = LVT_MASK, [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static u8 count_vectors(void *bitmap) { int vec; u32 *reg; u8 count = 0; for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec); count += hweight32(*reg); } return count; } bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { unsigned long pir_vals[NR_PIR_WORDS]; u32 *__pir = (void *)pir_vals; u32 i, vec; u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; if (!pi_harvest_pir(pir, pir_vals)) return false; for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = READ_ONCE(*p_irr); if (__pir[i]) { prev_irr_val = irr_val; do { irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); if (prev_irr_val != irr_val) max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; } return ((max_updated_irr != -1) && (max_updated_irr == *max_irr)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); if (unlikely(!apic->apicv_active && irr_updated)) apic->irr_pending = true; return irr_updated; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { return apic_find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; /* * Note that irr_pending is just a hint. It will be always * true with virtual interrupt delivery enabled. */ if (!apic->irr_pending) return -1; result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); return result; } static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { if (unlikely(apic->apicv_active)) { apic_clear_vector(vec, apic->regs + APIC_IRR); } else { apic->irr_pending = false; apic_clear_vector(vec, apic->regs + APIC_IRR); if (apic_search_irr(apic) != -1) apic->irr_pending = true; } } void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) { apic_clear_irr(vec, vcpu->arch.apic); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_clear_irr); static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic) { return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec); } static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), apic_vector_to_isr(vec, apic))) return; /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ if (unlikely(apic->apicv_active)) kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); /* * ISR (in service register) bit is set when injecting an interrupt. * The highest vector is injected. Thus the latest bit set matches * the highest bit in ISR. */ apic->highest_isr_cache = vec; } } static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; /* * Note that isr_count is always 1, and highest_isr_cache * is always -1, with APIC virtualization enabled. */ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) return apic->highest_isr_cache; result = apic_find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); return result; } static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec), apic_vector_to_isr(vec, apic))) return; /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need * to trigger a new interrupt delivery by writing the SVI field; * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ if (unlikely(apic->apicv_active)) kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); apic->highest_isr_cache = -1; } } void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) return; kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ return apic_find_highest_irr(vcpu->arch.apic); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, struct dest_map *dest_map); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, irq->level, irq->trig_mode, dest_map); } static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, struct kvm_lapic_irq *irq, u32 min) { int i, count = 0; struct kvm_vcpu *vcpu; if (min > map->max_apic_id) return 0; min = array_index_nospec(min, map->max_apic_id + 1); for_each_set_bit(i, ipi_bitmap, min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { if (map->phys_map[min + i]) { vcpu = map->phys_map[min + i]->vcpu; count += kvm_apic_set_irq(vcpu, irq, NULL); } } return count; } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { struct kvm_apic_map *map; struct kvm_lapic_irq irq = {0}; int cluster_size = op_64_bit ? 64 : 32; int count; if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK)) return -KVM_EINVAL; irq.vector = icr & APIC_VECTOR_MASK; irq.delivery_mode = icr & APIC_MODE_MASK; irq.level = (icr & APIC_INT_ASSERT) != 0; irq.trig_mode = icr & APIC_INT_LEVELTRIG; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); count = -EOPNOTSUPP; if (likely(map)) { count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min); min += cluster_size; count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min); } rcu_read_unlock(); return count; } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) return; __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu) { u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) return false; val &= KVM_PV_EOI_ENABLED; if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) return false; /* * Clear pending bit in any case: it will be set again on vmentry. * While this might not be ideal from performance point of view, * this makes sure pv eoi is only enabled when we know it's safe. */ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); return val; } static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (kvm_x86_ops.sync_pir_to_irr) highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; } static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI); tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; if ((tpr & 0xf0) >= (isrv & 0xf0)) ppr = tpr & 0xff; else ppr = isrv & 0xf0; *new_ppr = ppr; if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); return ppr < old_ppr; } static void apic_update_ppr(struct kvm_lapic *apic) { u32 ppr; if (__apic_update_ppr(apic, &ppr) && apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) { apic_update_ppr(vcpu->arch.apic); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); apic_update_ppr(apic); } static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { return mda == (apic_x2apic_mode(apic) ? X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) { if (kvm_apic_broadcast(apic, mda)) return true; /* * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they * were in x2APIC mode if the target APIC ID can't be encoded as an * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC * mode. Match the x2APIC ID if and only if the target APIC ID can't * be encoded in xAPIC to avoid spurious matches against a vCPU that * changed its (addressable) xAPIC ID (which is writable). */ if (apic_x2apic_mode(apic) || mda > 0xff) return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { u32 logical_id; if (kvm_apic_broadcast(apic, mda)) return true; logical_id = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) return ((logical_id >> 16) == (mda >> 16)) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: return ((logical_id >> 4) == (mda >> 4)) && (logical_id & mda & 0xf) != 0; default: return false; } } /* The KVM local APIC implementation has two quirks: * * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. * In order to support broadcast without interrupt remapping, x2APIC * rewrites the destination of non-IPI messages from APIC_BROADCAST * to X2APIC_BROADCAST. * * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is * important when userspace wants to use x2APIC-format MSIs, because * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); ASSERT(target); switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, mda); else return kvm_apic_match_logical_addr(target, mda); case APIC_DEST_SELF: return target == source; case APIC_DEST_ALLINC: return true; case APIC_DEST_ALLBUT: return target != source; default: return false; } } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_match_dest); static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) { int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); BUG_ON(idx >= bitmap_size); return idx; } static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; pr_info("Disabled LAPIC found during irq injection\n"); } } static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map) { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && map->logical_mode != KVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; } else { bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) return true; } return false; } static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) { return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint); } static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. * Note: we may have zero kvm_lapic destinations when we return true, which * means that the interrupt should be dropped. In this case, *bitmap would be * zero and *dst undefined. */ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map, struct kvm_lapic ***dst, unsigned long *bitmap) { int i, lowest; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; *bitmap = 1; return true; } else if (irq->shorthand) return false; if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; } *bitmap = 0; if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, (u16 *)bitmap)) return false; if (!kvm_lowest_prio_delivery(irq)) return true; if (!vector_hashing_enabled) { lowest = -1; for_each_set_bit(i, bitmap, 16) { if (!(*dst)[i]) continue; if (lowest < 0) lowest = i; else if (kvm_apic_compare_prio((*dst)[i]->vcpu, (*dst)[lowest]->vcpu) < 0) lowest = i; } } else { if (!*bitmap) return true; lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), bitmap, 16); if (!(*dst)[lowest]) { kvm_apic_disabled_lapic_found(kvm); *bitmap = 0; return true; } } *bitmap = (lowest >= 0) ? 1 << lowest : 0; return true; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap; struct kvm_lapic **dst = NULL; int i; bool ret; *r = -1; if (irq->shorthand == APIC_DEST_SELF) { if (KVM_BUG_ON(!src, kvm)) { *r = 0; return true; } *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); if (ret) { *r = 0; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } } rcu_read_unlock(); return ret; } /* * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find * the right destination vCPU in the array for the lowest-priority * interrupt. * - Otherwise, use remapped mode to inject the interrupt. */ static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; unsigned long bitmap; struct kvm_lapic **dst = NULL; bool ret = false; if (irq->shorthand) return false; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && hweight16(bitmap) == 1) { unsigned long i = find_first_bit(&bitmap, 16); if (dst[i]) { *dest_vcpu = dst[i]->vcpu; ret = true; } } rcu_read_unlock(); return ret; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { int r = 0; unsigned long i; struct kvm_vcpu *vcpu; if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) return true; kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, irq->dest_id, irq->dest_mode)) continue; if (++r == 2) return false; *dest_vcpu = vcpu; } return r == 1; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { pr_info("apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, irq->dest_id, irq->dest_mode)) continue; if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { if (!vector_hashing_enabled) { if (!lowest) lowest = vcpu; else if (kvm_apic_compare_prio(vcpu, lowest) < 0) lowest = vcpu; } else { __set_bit(i, dest_vcpu_bitmap); dest_vcpus++; } } } if (dest_vcpus != 0) { int idx = kvm_vector_to_index(irq->vector, dest_vcpus, dest_vcpu_bitmap, KVM_MAX_VCPUS); lowest = kvm_get_vcpu(kvm, idx); } if (lowest) r = kvm_apic_set_irq(lowest, irq, dest_map); return r; } /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, struct dest_map *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; fallthrough; case APIC_DM_FIXED: if (unlikely(trig_mode && !level)) break; /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; result = 1; if (dest_map) { __set_bit(vcpu->vcpu_id, dest_map->map); dest_map->vectors[vcpu->vcpu_id] = vector; } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) apic_set_vector(vector, apic->regs + APIC_TMR); else apic_clear_vector(vector, apic->regs + APIC_TMR); } kvm_x86_call(deliver_interrupt)(apic, delivery_mode, trig_mode, vector); break; case APIC_DM_REMRD: result = 1; vcpu->arch.pv.pv_unhalted = 1; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: if (!kvm_inject_smi(vcpu)) { kvm_vcpu_kick(vcpu); result = 1; } break; case APIC_DM_NMI: result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (!trig_mode || level) { result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; case APIC_DM_STARTUP: result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); set_bit(KVM_APIC_SIPI, &apic->pending_events); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_EXTINT: /* * Should only be called by kvm_apic_local_deliver() with LVT0, * before NMI watchdog was enabled. Already handled by * kvm_apic_accept_pic_intr(). */ break; default: printk(KERN_ERR "TODO: unsupported delivery mode %x\n", delivery_mode); break; } return result; } /* * This routine identifies the destination vcpus mask meant to receive the * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find * out the destination vcpus array and set the bitmap or it traverses to * each available vcpu to identify the same. */ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, unsigned long *vcpu_bitmap) { struct kvm_lapic **dest_vcpu = NULL; struct kvm_lapic *src = NULL; struct kvm_apic_map *map; struct kvm_vcpu *vcpu; unsigned long bitmap, i; int vcpu_idx; bool ret; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu, &bitmap); if (ret) { for_each_set_bit(i, &bitmap, 16) { if (!dest_vcpu[i]) continue; vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx; __set_bit(vcpu_idx, vcpu_bitmap); } } else { kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, irq->dest_id, irq->dest_mode)) continue; __set_bit(i, vcpu_bitmap); } } rcu_read_unlock(); } static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) { return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); } static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { int __maybe_unused trigger_mode; /* Eoi the ioapic only if the ioapic doesn't own the vector. */ if (!kvm_ioapic_handles_vector(apic, vector)) return; /* * If the intercepted EOI is for an IRQ that was pending from previous * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely * no longer need to be intercepted. */ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); return; } #ifdef CONFIG_KVM_IOAPIC if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); #endif } static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); trace_kvm_eoi(apic, vector); /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) return vector; apic_clear_isr(vector, apic); apic_update_ppr(apic); if (kvm_hv_synic_has_vector(apic->vcpu, vector)) kvm_hv_synic_send_eoi(apic->vcpu, vector); kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; } /* * this interface assumes a trap-like exit, which has already finished * desired side effect including vISR and vPPR update. */ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_eoi(apic, vector); kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated); static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, u32 icr_high, struct kvm_lapic_irq *irq) { /* KVM has no delay and should always clear the BUSY/PENDING flag. */ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); irq->vector = icr_low & APIC_VECTOR_MASK; irq->delivery_mode = icr_low & APIC_MODE_MASK; irq->dest_mode = icr_low & APIC_DEST_MASK; irq->level = (icr_low & APIC_INT_ASSERT) != 0; irq->trig_mode = icr_low & APIC_INT_LEVELTRIG; irq->shorthand = icr_low & APIC_SHORT_MASK; irq->msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq->dest_id = icr_high; else irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high); } void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq); trace_kvm_apic_ipi(icr_low, irq.dest_id); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { ktime_t remaining, now; s64 ns; ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 || apic->lapic_timer.period == 0) return 0; now = ktime_get(); remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns * apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } static inline void report_tpr_access(struct kvm_lapic *apic, bool write) { if (apic->vcpu->arch.tpr_access_reporting) __report_tpr_access(apic, write); } static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; if (offset >= LAPIC_MMIO_LENGTH) return 0; switch (offset) { case APIC_ARBPRI: break; case APIC_TMCCT: /* Timer CCR */ if (apic_lvtt_tscdeadline(apic)) return 0; val = apic_get_tmcct(apic); break; case APIC_PROCPRI: apic_update_ppr(apic); val = kvm_lapic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); fallthrough; default: val = kvm_lapic_get_reg(apic, offset); break; } return val; } static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) { return container_of(dev, struct kvm_lapic, dev); } #define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { /* Leave bits '0' for reserved and write-only registers. */ u64 valid_reg_mask = APIC_REG_MASK(APIC_ID) | APIC_REG_MASK(APIC_LVR) | APIC_REG_MASK(APIC_TASKPRI) | APIC_REG_MASK(APIC_PROCPRI) | APIC_REG_MASK(APIC_LDR) | APIC_REG_MASK(APIC_SPIV) | APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | APIC_REG_MASK(APIC_ESR) | APIC_REG_MASK(APIC_ICR) | APIC_REG_MASK(APIC_LVTT) | APIC_REG_MASK(APIC_LVTTHMR) | APIC_REG_MASK(APIC_LVTPC) | APIC_REG_MASK(APIC_LVT0) | APIC_REG_MASK(APIC_LVT1) | APIC_REG_MASK(APIC_LVTERR) | APIC_REG_MASK(APIC_TMICT) | APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */ if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_ICR2); return valid_reg_mask; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; u32 result; /* * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in * x2APIC and needs to be manually handled by the caller. */ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR); if (alignment + len > 4) return 1; if (offset > 0x3f0 || !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset))) return 1; result = __apic_read(apic, offset & ~0xf); trace_kvm_apic_read(offset, result); switch (len) { case 1: case 2: case 4: memcpy(data, (char *)&result + alignment, len); break; default: printk(KERN_ERR "Local APIC read with len = %x, " "should be 1,2, or 4 instead\n", len); break; } return 0; } static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { return addr >= apic->base_address && addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, void *data) { struct kvm_lapic *apic = to_lapic(this); u32 offset = address - apic->base_address; if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) return -EOPNOTSUPP; memset(data, 0xff, len); return 0; } kvm_lapic_reg_read(apic, offset, len, data); return 0; } static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; tdcr = kvm_lapic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); } static void limit_periodic_timer_frequency(struct kvm_lapic *apic) { /* * Do not allow the guest to program periodic timers with small * interval, since the hrtimers are not throttled by the host * scheduler. */ if (apic_lvtt_period(apic) && apic->lapic_timer.period) { s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { pr_info_once( "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); apic->lapic_timer.period = min_period; } } } static void cancel_hv_timer(struct kvm_lapic *apic); static void cancel_apic_timer(struct kvm_lapic *apic) { hrtimer_cancel(&apic->lapic_timer.timer); preempt_disable(); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); preempt_enable(); atomic_set(&apic->lapic_timer.pending, 0); } static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; } apic->lapic_timer.timer_mode = timer_mode; limit_periodic_timer_frequency(apic); } } /* * On APICv, this test will cause a busy wait * during a higher-priority task. */ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg; /* * Assume a timer IRQ was "injected" if the APIC is protected. KVM's * copy of the vIRR is bogus, it's the responsibility of the caller to * precisely check whether or not a timer IRQ is pending. */ if (apic->guest_apic_protected) return true; reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) return true; } return false; } static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) { u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; /* * If the guest TSC is running at a different ratio than the host, then * convert the delay to nanoseconds to achieve an accurate delay. Note * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. */ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { u64 delay_ns = guest_cycles * 1000000ULL; do_div(delay_ns, vcpu->arch.virtual_tsc_khz); ndelay(min_t(u32, delay_ns, timer_advance_ns)); } } static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; /* Do not adjust for tiny fluctuations or large random spikes. */ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX || abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) return; /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); /* * If the timer fired early, reread the TSC to account for the overhead * of the above adjustment to avoid waiting longer than is necessary. */ if (guest_tsc < tsc_deadline) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.expired_tscdeadline && vcpu->arch.apic->lapic_timer.timer_advance_ns && lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_wait_lapic_expire); static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) { ktimer->tscdeadline = 0; } else if (apic_lvtt_oneshot(apic)) { ktimer->tscdeadline = 0; ktimer->target_expiration = 0; } } static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) return; if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { /* * Ensure the guest's timer has truly expired before posting an * interrupt. Open code the relevant checks to avoid querying * lapic_timer_int_injected(), which will be false since the * interrupt isn't yet injected. Waiting until after injecting * is not an option since that won't help a posted interrupt. */ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && vcpu->arch.apic->lapic_timer.timer_advance_ns) __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } atomic_inc(&apic->lapic_timer.pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; ktime_t now; if (unlikely(!tscdeadline || !this_tsc_khz)) return; local_irq_save(flags); now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); if (likely(tscdeadline > guest_tsc) && likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else apic_timer_expired(apic, false); local_irq_restore(flags); } static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) { return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns * (u64)apic->divide_count; } static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; apic->lapic_timer.period = tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = 0; ns_remaining_old = ktime_to_ns(remaining); ns_remaining_new = mul_u64_u32_div(ns_remaining_old, apic->divide_count, old_divisor); apic->lapic_timer.tscdeadline += nsec_to_cycles(apic->vcpu, ns_remaining_new) - nsec_to_cycles(apic->vcpu, ns_remaining_old); apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); s64 deadline; now = ktime_get(); apic->lapic_timer.period = tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; return false; } limit_periodic_timer_frequency(apic); deadline = apic->lapic_timer.period; if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { if (unlikely(count_reg != APIC_TMICT)) { deadline = tmict_to_ns(apic, kvm_lapic_get_reg(apic, count_reg)); if (unlikely(deadline <= 0)) { if (apic_lvtt_period(apic)) deadline = apic->lapic_timer.period; else deadline = 0; } else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " "Using initial count to start timer.\n", apic->vcpu->vcpu_id, count_reg, kvm_lapic_get_reg(apic, count_reg), deadline, apic->lapic_timer.period); kvm_lapic_set_reg(apic, count_reg, 0); deadline = apic->lapic_timer.period; } } } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, deadline); apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } static void advance_periodic_target_expiration(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; ktime_t now = ktime_get(); u64 tscl = rdtsc(); ktime_t delta; /* * Use kernel time as the time source for both the hrtimer deadline and * TSC-based deadline so that they stay synchronized. Computing each * deadline independently will cause the two deadlines to drift apart * over time as differences in the periods accumulate, e.g. due to * differences in the underlying clocks or numerical approximation errors. */ ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration, ktimer->period); /* * If the new expiration is in the past, e.g. because userspace stopped * running the VM for an extended duration, then force the expiration * to "now" and don't try to play catch-up with the missed events. KVM * will only deliver a single interrupt regardless of how many events * are pending, i.e. restarting the timer with an expiration in the * past will do nothing more than waste host cycles, and can even lead * to a hard lockup in extreme cases. */ if (ktime_before(ktimer->target_expiration, now)) ktimer->target_expiration = now; /* * Note, ensuring the expiration isn't in the past also prevents delta * from going negative, which could cause the TSC deadline to become * excessively large due to it an unsigned value. */ delta = ktime_sub(ktimer->target_expiration, now); ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) { if (!apic->lapic_timer.period) return; if (ktime_after(ktime_get(), apic->lapic_timer.target_expiration)) { apic_timer_expired(apic, false); if (apic_lvtt_oneshot(apic)) return; advance_periodic_target_expiration(apic); } hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, HRTIMER_MODE_ABS_HARD); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) return false; return vcpu->arch.apic->lapic_timer.hv_timer_in_use; } static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); kvm_x86_call(cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } static bool start_hv_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; struct kvm_vcpu *vcpu = apic->vcpu; bool expired; WARN_ON(preemptible()); if (!kvm_can_use_hv_timer(vcpu)) return false; if (!ktimer->tscdeadline) return false; if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; hrtimer_cancel(&ktimer->timer); /* * To simplify handling the periodic timer, leave the hv timer running * even if the deadline timer has expired, i.e. rely on the resulting * VM-Exit to recompute the periodic timer's target expiration. */ if (!apic_lvtt_period(apic)) { /* * Cancel the hv timer if the sw timer fired while the hv timer * was being programmed, or if the hv timer itself expired. */ if (atomic_read(&ktimer->pending)) { cancel_hv_timer(apic); } else if (expired) { apic_timer_expired(apic, false); cancel_hv_timer(apic); } } trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use); return true; } static void start_sw_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; WARN_ON(preemptible()); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return; if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) start_sw_period(apic); else if (apic_lvtt_tscdeadline(apic)) start_sw_tscdeadline(apic); trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); } static void restart_apic_timer(struct kvm_lapic *apic) { preempt_disable(); if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending)) goto out; if (!start_hv_timer(apic)) start_sw_timer(apic); out: preempt_enable(); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; preempt_disable(); /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; WARN_ON(kvm_vcpu_is_blocking(vcpu)); apic_timer_expired(apic, false); cancel_hv_timer(apic); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); restart_apic_timer(apic); } out: preempt_enable(); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { restart_apic_timer(vcpu->arch.apic); } void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ if (apic->lapic_timer.hv_timer_in_use) start_sw_timer(apic); preempt_enable(); } void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); restart_apic_timer(apic); } static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } static void start_apic_timer(struct kvm_lapic *apic) { __start_apic_timer(apic, APIC_TMICT); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; if (lvt0_in_nmi_mode) { atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } else atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } } static int get_lvt_index(u32 reg) { if (reg == APIC_LVTCMCI) return LVT_CMCI; if (reg < APIC_LVTT || reg > APIC_LVTERR) return -1; return array_index_nospec( (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); } static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; trace_kvm_apic_write(reg, val); switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) { kvm_apic_set_xapic_id(apic, val >> 24); } else { ret = 1; } break; case APIC_TASKPRI: report_tpr_access(apic, true); apic_set_tpr(apic, val & 0xff); break; case APIC_EOI: apic_set_eoi(apic); break; case APIC_LDR: if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else ret = 1; break; case APIC_DFR: if (!apic_x2apic_mode(apic)) kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); else ret = 1; break; case APIC_SPIV: { u32 mask = 0x3ff; if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; for (i = 0; i < apic->nr_lvt_entries; i++) { kvm_lapic_set_reg(apic, APIC_LVTx(i), kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); } break; } case APIC_ICR: WARN_ON_ONCE(apic_x2apic_mode(apic)); /* No delay here, so we always clear the pending bit */ val &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; case APIC_ICR2: if (apic_x2apic_mode(apic)) ret = 1; else kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); fallthrough; case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: case APIC_LVTERR: case APIC_LVTCMCI: { u32 index = get_lvt_index(reg); if (!kvm_lapic_lvt_supported(apic, index)) { ret = 1; break; } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask); kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) break; cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb); update_divide_count(apic); if (apic->divide_count != old_divisor && apic->lapic_timer.period) { hrtimer_cancel(&apic->lapic_timer.timer); update_target_expiration(apic, old_divisor); restart_apic_timer(apic); } break; } case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) ret = 1; break; case APIC_SELF_IPI: /* * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold * the vector, everything else is reserved. */ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK)) ret = 1; else kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0); break; default: ret = 1; break; } /* * Recalculate APIC maps if necessary, e.g. if the software enable bit * was toggled, the APIC ID changed, etc... The maps are marked dirty * on relevant changes, i.e. this is a nop for most writes. */ kvm_recalculate_apic_map(apic->vcpu->kvm); return ret; } static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) { struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; u32 val; if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) return -EOPNOTSUPP; return 0; } /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. * Refer SDM 8.4.1 */ if (len != 4 || (offset & 0xf)) return 0; val = *(u32*)data; kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_set_eoi); #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast) { if (data & X2APIC_ICR_RESERVED_BITS) return 1; /* * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but * only AMD requires it to be zero, Intel essentially just ignores the * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, * the CPU performs the reserved bits checks, i.e. the underlying CPU * behavior will "win". Arbitrarily clear the BUSY bit, as there is no * sane way to provide consistent behavior with respect to hardware. */ data &= ~APIC_ICR_BUSY; if (fast) { struct kvm_lapic_irq irq; int ignored; kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq); if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq, &ignored, NULL)) return -EWOULDBLOCK; trace_kvm_apic_ipi((u32)data, irq.dest_id); } else { kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); } if (kvm_x86_ops.x2apic_icr_is_split) { kvm_lapic_set_reg(apic, APIC_ICR, data); kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); } else { kvm_lapic_set_reg64(apic, APIC_ICR, data); } trace_kvm_apic_write(APIC_ICR, data); return 0; } static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) { return __kvm_x2apic_icr_write(apic, data, false); } int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data) { return __kvm_x2apic_icr_write(apic, data, true); } static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) { if (kvm_x86_ops.x2apic_icr_is_split) return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; return kvm_lapic_get_reg64(apic, APIC_ICR); } /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { struct kvm_lapic *apic = vcpu->arch.apic; /* * ICR is a single 64-bit register when x2APIC is enabled, all others * registers hold 32-bit values. For legacy xAPIC, ICR writes need to * go down the common path to get the upper half from ICR2. * * Note, using the write helpers may incur an unnecessary write to the * virtual APIC state, but KVM needs to conditionally modify the value * in certain cases, e.g. to clear the ICR busy bit. The cost of extra * conditional branches is likely a wash relative to the cost of the * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (!vcpu->arch.apic) { static_branch_dec(&kvm_has_noapic_vcpu); return; } hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_branch_slow_dec_deferred(&apic_hw_disabled); if (!apic->sw_enabled) static_branch_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) free_page((unsigned long)apic->regs); kfree(apic); } /* *---------------------------------------------------------------------- * LAPIC interface *---------------------------------------------------------------------- */ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; } void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) { u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) vcpu->arch.cpuid_dynamic_bits_dirty = true; if (!apic) return; /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_branch_slow_dec_deferred(&apic_hw_disabled); /* Check if there are APF page ready requests pending */ kvm_make_request(KVM_REQ_APF_READY, vcpu); } else { static_branch_inc(&apic_hw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); else if (value & MSR_IA32_APICBASE_ENABLE) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); kvm_x86_call(set_virtual_apic_mode)(vcpu); } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) { kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); } } int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(value); if (vcpu->arch.apic_base == value) return 0; u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) return 1; if (!host_initiated) { if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) return 1; if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) return 1; } __kvm_apic_set_base(vcpu, value); kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base); void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; /* * When APICv is enabled, KVM must always search the IRR for a pending * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU * isn't running. If APICv is disabled, KVM _should_ search the IRR * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching * the IRR at this time could race with IRQ delivery from hardware that * still sees APICv as being enabled. * * FIXME: Ensure other vCPUs and devices observe the change in APICv * state prior to updating KVM's metadata caches, so that KVM * can safely search the IRR and set irr_pending accordingly. */ apic->irr_pending = true; if (apic->apicv_active) apic->isr_count = 1; else apic->isr_count = count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; } int kvm_alloc_apic_access_page(struct kvm *kvm) { void __user *hva; guard(mutex)(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled || kvm->arch.apic_access_memslot_inhibited) return 0; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (IS_ERR(hva)) return PTR_ERR(hva); kvm->arch.apic_access_memslot_enabled = true; return 0; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_alloc_apic_access_page); void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (!kvm->arch.apic_access_memslot_enabled) return; kvm_vcpu_srcu_read_unlock(vcpu); mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled) { __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); /* * Clear "enabled" after the memslot is deleted so that a * different vCPU doesn't get a false negative when checking * the flag out of slots_lock. No additional memory barrier is * needed as modifying memslots requires waiting other vCPUs to * drop SRCU (see above), and false positives are ok as the * flag is rechecked after acquiring slots_lock. */ kvm->arch.apic_access_memslot_enabled = false; /* * Mark the memslot as inhibited to prevent reallocating the * memslot during vCPU creation, e.g. if a vCPU is hotplugged. */ kvm->arch.apic_access_memslot_inhibited = true; } mutex_unlock(&kvm->slots_lock); kvm_vcpu_srcu_read_lock(vcpu); } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; u64 msr_val; int i; kvm_x86_call(apicv_pre_state_restore)(vcpu); if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) msr_val |= MSR_IA32_APICBASE_BSP; /* * Use the inner helper to avoid an extra recalcuation of the * optimized APIC map if some other task has dirtied the map. * The recalculation needed for this vCPU will be done after * all APIC state has been initialized (see below). */ __kvm_apic_set_base(vcpu, msr_val); } if (!apic) return; /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < apic->nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); kvm_lapic_set_reg(apic, APIC_ESR, 0); if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_ICR, 0); kvm_lapic_set_reg(apic, APIC_ICR2, 0); } else { kvm_lapic_set_reg64(apic, APIC_ICR, 0); } kvm_lapic_set_reg(apic, APIC_TDCR, 0); kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } kvm_apic_update_apicv(vcpu); update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); kvm_x86_call(hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; kvm_recalculate_apic_map(vcpu->kvm); } /* *---------------------------------------------------------------------- * timer interface *---------------------------------------------------------------------- */ static bool lapic_is_periodic(struct kvm_lapic *apic) { return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); return 0; } int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); if (r && lvt_type == APIC_LVTPC && guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } return 0; } void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (apic) kvm_apic_local_deliver(apic, APIC_LVT0); } static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, }; static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); apic_timer_expired(apic, true); if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) { advance_periodic_target_expiration(apic); hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; } int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; ASSERT(vcpu != NULL); if (!irqchip_in_kernel(vcpu->kvm)) { static_branch_inc(&kvm_has_noapic_vcpu); return 0; } apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; if (kvm_x86_ops.alloc_apic_backing_page) apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu); else apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->vcpu = vcpu; apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; /* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); /* * Defer evaluating inhibits until the vCPU is first run, as this vCPU * will not get notified of any changes until this vCPU is visible to * other vCPUs (marked online and added to the set of vCPUs). * * Opportunistically mark APICv active as VMX in particularly is highly * unlikely to have inhibits. Ignore the current per-VM APICv state so * that vCPU creation is guaranteed to run with a deterministic value, * the request will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } return 0; nomem_free_apic: kfree(apic); vcpu->arch.apic = NULL; nomem: return -ENOMEM; } int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; if (!kvm_apic_present(vcpu)) return -1; if (apic->guest_apic_protected) return -1; __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) return 1; return 0; } void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_inject_pending_timer_irqs(apic); atomic_set(&apic->lapic_timer.pending, 0); } } void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; if (WARN_ON_ONCE(vector < 0 || !apic)) return; /* * We get here even with APIC virtualization enabled, if doing * nested virtualization and L1 runs with the "acknowledge interrupt * on exit" mode. Then we cannot inject the interrupt via RVI, * because the process would deliver it through the IDT. */ apic_clear_irr(vector, apic); if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another * KVM_REQ_EVENT. */ apic_update_ppr(apic); } else { /* * For normal interrupts, PPR has been raised and there cannot * be a higher-priority pending interrupt---except if there was * a concurrent interrupt injection, but that would have * triggered KVM_REQ_EVENT already. */ apic_set_isr(vector, apic); __apic_update_ppr(apic, &ppr); } } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_ack_interrupt); static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { if (*id != x2apic_id) return -EINVAL; } else { /* * Ignore the userspace value when setting APIC state. * KVM's model is that the x2APIC ID is readonly, e.g. * KVM only supports delivering interrupts to KVM's * version of the x2APIC ID. However, for backwards * compatibility, don't reject attempts to set a * mismatched ID for userspace that hasn't opted into * x2apic_format. */ if (set) *id = x2apic_id; else *id = x2apic_id << 24; } /* * In x2APIC mode, the LDR is fixed and based on the id. And * if the ICR is _not_ split, ICR is internally a single 64-bit * register, but needs to be split to ICR+ICR2 in userspace for * backwards compatibility. */ if (set) *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); if (!kvm_x86_ops.x2apic_icr_is_split) { if (set) { icr = apic_get_reg(s->regs, APIC_ICR) | (u64)apic_get_reg(s->regs, APIC_ICR2) << 32; apic_set_reg64(s->regs, APIC_ICR, icr); } else { icr = apic_get_reg64(s->regs, APIC_ICR); apic_set_reg(s->regs, APIC_ICR2, icr >> 32); } } } return 0; } int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); /* * Get calculated timer current count for remaining timer period (if * any) and store it in the returned register set. */ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT)); return kvm_apic_state_fixup(vcpu, s, false); } int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; int r; kvm_x86_call(apicv_pre_state_restore)(vcpu); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); r = kvm_apic_state_fixup(vcpu, s, true); if (r) { kvm_recalculate_apic_map(vcpu->kvm); return r; } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); cancel_apic_timer(apic); apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); __start_apic_timer(apic, APIC_TMCCT); kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); #ifdef CONFIG_KVM_IOAPIC if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); #endif vcpu->arch.apic_arb_prio = 0; return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; if (!lapic_in_kernel(vcpu) || kvm_can_post_timer_interrupt(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD); } /* * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt * * Detect whether guest triggered PV EOI since the * last entry. If yes, set EOI on guests's behalf. * Clear PV EOI in guest memory in any case. */ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { int vector; /* * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host * and KVM_PV_EOI_ENABLED in guest memory as follows: * * KVM_APIC_PV_EOI_PENDING is unset: * -> host disabled PV EOI. * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: * -> host enabled PV EOI, guest did not execute EOI yet. * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: * -> host enabled PV EOI, guest executed EOI. */ BUG_ON(!pv_eoi_enabled(vcpu)); if (pv_eoi_test_and_clr_pending(vcpu)) return; vector = apic_set_eoi(apic); trace_kvm_pv_eoi(apic, vector); } void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } /* * apic_sync_pv_eoi_to_guest - called before vmentry * * Detect whether it's safe to enable PV EOI and * if yes do so. */ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { if (!pv_eoi_enabled(vcpu) || /* IRR set or many bits in ISR: could be nested. */ apic->irr_pending || /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. */ return; } pv_eoi_set_pending(apic->vcpu); } void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; struct kvm_lapic *apic = vcpu->arch.apic; apic_sync_pv_eoi_to_guest(vcpu, apic); if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; max_isr = apic_find_highest_isr(apic); if (max_isr < 0) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } else { __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } vcpu->arch.apic->vapic_addr = vapic_addr; return 0; } static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { u32 low; if (reg == APIC_ICR) { *data = kvm_x2apic_icr_read(apic); return 0; } if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; *data = low; return 0; } static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) { /* * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and * can be written as such, all other registers remain accessible only * through 32-bit reads/writes. */ if (reg == APIC_ICR) return kvm_x2apic_icr_write(apic, data); /* Bits 63:32 are reserved in all other registers. */ if (data >> 32) return 1; return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; return kvm_lapic_msr_write(apic, reg, data); } int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; return kvm_lapic_msr_read(apic, reg, data); } int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { if (!lapic_in_kernel(vcpu)) return 1; return kvm_lapic_msr_write(vcpu->arch.apic, reg, data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) { if (!lapic_in_kernel(vcpu)) return 1; return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; int ret; if (!IS_ALIGNED(addr, 4)) return 1; if (data & KVM_MSR_ENABLED) { if (addr == ghc->gpa && len <= ghc->len) new_len = ghc->len; else new_len = len; ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); if (ret) return ret; } vcpu->arch.pv_eoi.msr_val = data; return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; int r; if (!kvm_apic_has_pending_init_or_sipi(vcpu)) return 0; if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) return r == -EBUSY ? 0 : r; /* * Continue processing INIT/SIPI even if a nested VM-Exit * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI * are blocked as a result of transitioning to VMX root mode. */ } /* * INITs are blocked while CPU is in specific states (SMM, VMX root * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in * wait-for-SIPI (WFS). */ if (!kvm_apic_init_sipi_allowed(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); clear_bit(KVM_APIC_SIPI, &apic->pending_events); return 0; } if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector); kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } } return 0; } void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } |
| 40 1474 180 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_NS_H #define _LINUX_PID_NS_H #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/idr.h> /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) /* modes for vm.memfd_noexec sysctl */ #define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; int pid_max; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #if defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif #endif } __randomize_layout; extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { ns_ref_inc(ns); return ns; } #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { int scope = MEMFD_NOEXEC_SCOPE_EXEC; for (; ns; ns = ns->parent) scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); return scope; } #else static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } #endif extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); extern bool pidns_is_ancestor(struct pid_namespace *child, struct pid_namespace *ancestor); #else /* !CONFIG_PID_NS */ #include <linux/err.h> static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { return ns; } static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); return ns; } static inline void put_pid_ns(struct pid_namespace *ns) { } static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } static inline bool pidns_is_ancestor(struct pid_namespace *child, struct pid_namespace *ancestor) { return false; } #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); int register_pidns_sysctls(struct pid_namespace *pidns); void unregister_pidns_sysctls(struct pid_namespace *pidns); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { return task_active_pid_ns(tsk) == &init_pid_ns; } #endif /* _LINUX_PID_NS_H */ |
| 7 7 6 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:mac type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 #define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); /* Type specific function prefix */ #define HTYPE hash_mac /* Member elements */ struct hash_mac4_elem { /* Zero valued IP addresses cannot be stored */ union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_mac4_data_equal(const struct hash_mac4_elem *e1, const struct hash_mac4_elem *e2, u32 *multi) { return ether_addr_equal(e1->ether, e2->ether); } static bool hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e) { if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_mac4_data_next(struct hash_mac4_elem *next, const struct hash_mac4_elem *e) { } #define MTYPE hash_mac4 #define HOST_MASK 32 #define IP_SET_EMIT_CREATE #define IP_SET_PROTO_UNDEF #include "ip_set_hash_gen.h" static int hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_ONE_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } static struct ip_set_type hash_mac_type __read_mostly = { .name = "hash:mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_MAC, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_mac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_mac_init(void) { return ip_set_type_register(&hash_mac_type); } static void __exit hash_mac_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_mac_type); } module_init(hash_mac_init); module_exit(hash_mac_fini); |
| 3204 3209 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 Google LLC. */ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H #include <linux/sched.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/lsm_hooks.h> #ifdef CONFIG_BPF_LSM #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ RET bpf_lsm_##NAME(__VA_ARGS__); #include <linux/lsm_hook_defs.h> #undef LSM_HOOK struct bpf_storage_blob { struct bpf_local_storage __rcu *storage; }; extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range); int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags); int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str); bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return false; } static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) { return false; } static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { return NULL; } static inline void bpf_inode_storage_free(struct inode *inode) { } static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func) { } static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *range) { return -EOPNOTSUPP; } static inline int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, const struct bpf_dynptr *value_p, int flags) { return -EOPNOTSUPP; } static inline int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str) { return -EOPNOTSUPP; } static inline bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) { return false; } #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ |
| 32 32 31 31 20 20 37 32 32 29 32 27 36 26 26 11 6 103 103 36 36 102 103 103 103 7 31 31 31 36 31 31 36 31 31 30 30 30 78 82 89 78 95 122 121 6 6 6 6 6 6 36 36 36 36 36 31 7 7 7 7 7 34 1 118 118 117 117 117 7 118 38 3 1 38 38 37 37 37 37 37 32 36 36 36 36 37 1 27 2 27 28 28 28 1 27 14 14 1 1 14 26 26 9 21 20 19 20 20 16 16 8 21 21 2 2 2 20 14 26 8 73 73 73 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 1 2 2 2 29 28 28 27 27 28 27 27 25 25 31 25 3 3 3 2 32 32 32 3 2 3 8 7 8 1 7 1 5 5 4 1 8 3 1 2 3 3 3 3 3 3 5 5 5 5 12 12 12 12 32 9 9 9 1 9 9 9 9 9 9 88 88 88 89 89 89 88 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 2 70 70 70 70 70 70 70 70 70 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 | // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/filelock.h> #include <linux/xattr.h> #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/ratelimit.h> #include <linux/overflow.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; bool check_xwhiteout; const char *c_name; int c_len; char name[]; }; struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; struct unicode_map *map; int count; int err; bool is_upper; bool d_type_supported; bool in_xwhiteouts_dir; }; struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; struct list_head *cursor; struct file *realfile; struct file *upperfile; }; static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { return rb_entry(n, struct ovl_cache_entry, node); } static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len, char **dst) { const struct qstr qstr = { .name = str, .len = len }; char *cf_name; int cf_len; if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len)) return 0; cf_name = kmalloc(NAME_MAX, GFP_KERNEL); if (!cf_name) { rdd->err = -ENOMEM; return -ENOMEM; } cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX); if (cf_len > 0) *dst = cf_name; else kfree(cf_name); return cf_len; } static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) { bool found = false; struct rb_node **newp = *link; while (!found && *newp) { int cmp; struct ovl_cache_entry *tmp; *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); cmp = strncmp(name, tmp->c_name, len); if (cmp > 0) newp = &tmp->node.rb_right; else if (cmp < 0 || len < tmp->c_len) newp = &tmp->node.rb_left; else found = true; } *link = newp; return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, const char *name, int len) { struct rb_node *node = root->rb_node; int cmp; while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); cmp = strncmp(name, p->c_name, len); if (cmp > 0) node = p->node.rb_right; else if (cmp < 0 || len < p->c_len) node = p->node.rb_left; else return p; } return NULL; } static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p) { /* Don't care if not doing ovl_iter() */ if (!rdd->dentry) return false; /* Always recalc d_ino when remapping lower inode numbers */ if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb))) return true; /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; /* If this is lower, then native d_ino will do */ if (!rdd->is_upper) return false; /* * Recalc d_ino for '.' and for all entries if dir is impure (contains * copied up entries) */ if ((p->name[0] == '.' && p->len == 1) || ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) return true; return false; } static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL); if (!p) return NULL; memcpy(p->name, name, len); p->name[len] = '\0'; p->len = len; p->type = d_type; p->real_ino = ino; p->ino = ino; /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (c_name && c_name != name) { p->c_name = c_name; p->c_len = c_len; } else { p->c_name = p->name; p->c_len = len; } if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; } return p; } /* Return 0 for found, 1 for added, <0 for error */ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent)) return 0; p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return -ENOMEM; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); return 1; } /* Return 0 for found, 1 for added, <0 for error */ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, const char *c_name, int c_len, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(rdd->root, c_name, c_len); if (p) { list_move_tail(&p->l_node, &rdd->middle); return 0; } else { p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } return rdd->err ?: 1; } static void ovl_cache_entry_free(struct ovl_cache_entry *p) { if (p->c_name != p->name) kfree(p->c_name); kfree(p); } void ovl_cache_free(struct list_head *list) { struct ovl_cache_entry *p; struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) ovl_cache_entry_free(p); INIT_LIST_HEAD(list); } void ovl_dir_cache_free(struct inode *inode) { struct ovl_dir_cache *cache = ovl_dir_cache(inode); if (cache) { ovl_cache_free(&cache->entries); kfree(cache); } } static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { if (ovl_dir_cache(inode) == cache) ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); } } static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb); const char *c_name = NULL; char *cf_name = NULL; int c_len = 0, ret; if (ofs->casefold) c_len = ovl_casefold(rdd, name, namelen, &cf_name); if (rdd->err) return false; if (c_len <= 0) { c_name = name; c_len = namelen; } else { c_name = cf_name; } rdd->count++; if (!rdd->is_lowest) ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type); else ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type); /* * If ret == 1, that means that c_name is being used as part of struct * ovl_cache_entry and will be freed at ovl_cache_free(). Otherwise, * c_name was found in the rb-tree so we can free it here. */ if (ret != 1 && c_name != name) kfree(c_name); return ret >= 0; } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { struct dentry *dentry, *dir = path->dentry; while (rdd->first_maybe_whiteout) { struct ovl_cache_entry *p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } else if (PTR_ERR(dentry) == -EINTR) { return -EINTR; } } return 0; } static inline int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; int err; realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); if (IS_ERR(realfile)) return PTR_ERR(realfile); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; rdd->err = 0; err = iterate_dir(realfile, &rdd->ctx); if (err >= 0) err = rdd->err; } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath, rdd); fput(realfile); return err; } static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct inode *inode = file_inode(file); bool is_real; if (cache && ovl_inode_version_get(inode) != cache->version) { ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) return; od->is_real = false; } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .ctx.count = INT_MAX, .dentry = dentry, .list = list, .root = root, .is_lowest = false, .map = NULL, }; int idx, next; const struct ovl_layer *layer; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); if (ofs->casefold) rdd.map = sb_encoding(realpath.dentry->d_sb); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); if (err) break; } else { /* * Insert lowest layer entries before upper ones, this * allows offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_lowest = true; err = ovl_dir_read(&realpath, &rdd); list_del(&rdd.middle); } } return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { struct list_head *p; loff_t off = 0; list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } /* Cursor is safe since the cache is stable */ od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; struct inode *inode = d_inode(dentry); cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; } ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); cache->root = RB_ROOT; res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen, bool warn) { unsigned int xinoshift = 64 - xinobits; if (unlikely(ino >> xinoshift)) { if (warn) { pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); } return ino; } /* * The lowest xinobit is reserved for mapping the non-peresistent inode * numbers range, but this range is only exposed via st_ino, not here. */ return ino | ((u64)fsid) << (xinoshift + 1); } /* * Set d_ino for upper entries if needed. Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layer are on same fs, report real ino also for upper. * * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). * * Also checks the overlay.whiteout xattr by doing a full lookup which will return * negative in this case. */ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; int xinobits = ovl_xino_bits(ofs); int err = 0; if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { if (p->len == 1) { this = dget(dir); goto get; } if (p->len == 2 && p->name[1] == '.') { /* we shall not be moved */ this = dget(dir->d_parent); goto get; } } /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; goto fail; } goto out; } get: if (!ovl_same_dev(ofs) || !update_ino) goto out; type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; struct path statpath = *path; statpath.dentry = this; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) goto fail; /* * Directory inode is always on overlay st_dev. * Non-dir with ovl_same_dev() could be on pseudo st_dev in case * of xino bits overflow. */ WARN_ON_ONCE(S_ISDIR(stat.mode) && dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, p->name, p->len, ovl_xino_warn(ofs)); } out: p->ino = ino; dput(this); return err; fail: pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); return true; } static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = list, .root = root, }; INIT_LIST_HEAD(list); *root = RB_ROOT; ovl_path_upper(path->dentry, &realpath); err = ovl_dir_read(&realpath, &rdd); if (err) return err; list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { err = ovl_cache_update(path, p, true); if (err) return err; } if (p->ino == p->real_ino) { list_del(&p->l_node); ovl_cache_entry_free(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, &newp, &parent))) return -EIO; rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, root); } } return 0; } static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ ovl_dir_cache_free(inode); ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); res = ovl_dir_read_impure(path, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } if (list_empty(&cache->entries)) { /* * A good opportunity to get rid of an unneeded "impure" flag. * Removing the "impure" xattr is best effort. */ if (!ovl_want_write(dentry)) { ovl_removexattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } struct ovl_readdir_translate { struct dir_context *orig_ctx; struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; int fsid; int xinobits; bool xinowarn; }; static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; bool res; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, name, namelen, rdt->xinowarn); } res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); ctx->count = orig_ctx->count; return res; } static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of * iterating a lower real dir, dir could be copied up and marked * impure. We only want the impure cache if we started iterating * a real upper dir to begin with. */ return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); } static int ovl_iterate_real(struct file *file, struct dir_context *ctx) { int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .ctx.count = ctx->count, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), }; if (rdt.xinobits && lower_layer) rdt.fsid = lower_layer->fsid; if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; statpath.dentry = dir->d_parent; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) return err; WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); rdt.parent_ino = stat.ino; } if (ovl_is_impure_dir(file)) { rdt.cache = ovl_cache_get_impure(&file->f_path); if (IS_ERR(rdt.cache)) return PTR_ERR(rdt.cache); } err = iterate_dir(od->realfile, &rdt.ctx); ctx->pos = rdt.ctx.pos; return err; } static int ovl_iterate_merged(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; int err = 0; if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) return err; od->cache = cache; ovl_seek_cursor(od, ctx->pos); } while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) return err; } } /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; } od->cursor = p->l_node.next; ctx->pos++; } return err; } static bool ovl_need_adjust_d_ino(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); /* If parent is merge, then need to adjust d_ino for '..' */ if (ovl_xino_bits(ofs)) return true; /* Can't do consistent inode numbering */ if (!ovl_same_fs(ofs)) return false; /* If dir is impure then need to adjust d_ino for copied up entries */ if (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))) return true; /* Pure: no need to adjust d_ino */ return false; } static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; if (!ctx->pos) ovl_dir_reset(file); with_ovl_creds(file_dentry(file)->d_sb) { if (!od->is_real) return ovl_iterate_merged(file, ctx); if (ovl_need_adjust_d_ino(file)) return ovl_iterate_real(file, ctx); return iterate_dir(od->realfile, ctx); } } static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; struct ovl_dir_file *od = file->private_data; inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); if (od->is_real) { res = vfs_llseek(od->realfile, offset, origin); file->f_pos = od->realfile->f_pos; } else { res = -EINVAL; switch (origin) { case SEEK_CUR: offset += file->f_pos; break; case SEEK_SET: break; default: goto out_unlock; } if (offset < 0) goto out_unlock; if (offset != file->f_pos) { file->f_pos = offset; if (od->cache) ovl_seek_cursor(od, offset); } res = offset; } out_unlock: inode_unlock(file_inode(file)); return res; } static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { with_ovl_creds(file_inode(file)->i_sb) return ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); } /* * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. * * TODO: use same abstract type for file->private_data of dir and file so * upperfile could also be cached for files as well. */ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; /* * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); if (IS_ERR(realfile)) return realfile; old = cmpxchg_release(&od->upperfile, NULL, realfile); if (old) { fput(realfile); realfile = old; } } } return realfile; } static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct file *realfile; int err; err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); /* Nothing to sync for lower */ if (!realfile || err) return err; return vfs_fsync_range(realfile, start, end, datasync); } static int ovl_dir_release(struct inode *inode, struct file *file) { struct ovl_dir_file *od = file->private_data; if (od->cache) { inode_lock(inode); ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); if (od->upperfile) fput(od->upperfile); kfree(od); return 0; } static int ovl_dir_open(struct inode *inode, struct file *file) { struct path realpath; struct file *realfile; struct ovl_dir_file *od; enum ovl_path_type type; od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); if (!od) return -ENOMEM; type = ovl_path_real(file->f_path.dentry, &realpath); realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); return PTR_ERR(realfile); } od->realfile = realfile; od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; } WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, .setlease = generic_setlease, }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; with_ovl_creds(dentry->d_sb) err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; err = 0; list_for_each_entry_safe(p, n, list, l_node) { /* * Select whiteouts in upperdir, they should * be cleared when deleting this directory. */ if (p->is_whiteout) { if (p->is_upper) continue; goto del_entry; } if (p->name[0] == '.') { if (p->len == 1) goto del_entry; if (p->len == 2 && p->name[1] == '.') goto del_entry; } err = -ENOTEMPTY; break; del_entry: list_del(&p->l_node); ovl_cache_entry_free(p); } return err; } void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; list_for_each_entry(p, list, l_node) { struct dentry *dentry; if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; } if (dentry->d_inode) ovl_cleanup(ofs, upper, dentry); dput(dentry); } } static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; return true; } /* * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values * if error is encountered. */ int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, .ctx.count = INT_MAX, .d_type_supported = false, }; err = ovl_dir_read(realpath, &rdd); if (err) return err; return rdd.d_type_supported; } #define OVL_INCOMPATDIR_NAME "incompat" static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = &list, }; bool incompat = false; /* * The "work/incompat" directory is treated specially - if it is not * empty, instead of printing a generic error and mounting read-only, * we will error about incompat features and fail the mount. * * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name * starts with '#'. */ if (level == 2 && !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) incompat = true; err = ovl_dir_read(path, &rdd); if (err) goto out; list_for_each_entry(p, &list, l_node) { struct dentry *dentry; if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } else if (incompat) { pr_err("overlay with incompat feature '%s' cannot be mounted\n", p->name); err = -EINVAL; break; } dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt, dentry, level); dput(dentry); if (err) break; } out: ovl_cache_free(&list); return err; } int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) return ovl_cleanup(ofs, parent, dentry); dentry = start_removing_dentry(parent, dentry); if (IS_ERR(dentry)) return PTR_ERR(dentry); err = ovl_do_rmdir(ofs, parent->d_inode, dentry); end_removing(dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); if (!err) err = ovl_cleanup(ofs, parent, dentry); } return err; } int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .ctx.count = INT_MAX, .list = &list, }; err = ovl_dir_read(&path, &rdd); if (err) goto out; list_for_each_entry(p, &list, l_node) { if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; break; } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1); if (err) break; goto next; } err = ovl_verify_index(ofs, index); if (!err) { goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ err = ovl_cleanup(ofs, indexdir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if * an incompatible index entry was found or on out * of memory. */ break; } else if (ofs->config.nfs_export) { /* * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ err = ovl_cleanup_and_whiteout(ofs, indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, indexdir, index); } if (err) break; next: dput(index); index = NULL; } dput(index); out: ovl_cache_free(&list); if (err) pr_err("failed index dir cleanup (%i)\n", err); return err; } |
| 741 153 153 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2025 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> */ #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/mempool.h> #include <linux/fserror.h> #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) static struct mempool fserror_events_pool; void fserror_mount(struct super_block *sb) { /* * The pending error counter is biased by 1 so that we don't wake_var * until we're actually trying to unmount. */ refcount_set(&sb->s_pending_errors, 1); } void fserror_unmount(struct super_block *sb) { /* * If we don't drop the pending error count to zero, then wait for it * to drop below 1, which means that the pending errors cleared and * hopefully we didn't saturate with 1 billion+ concurrent events. */ if (!refcount_dec_and_test(&sb->s_pending_errors)) wait_var_event(&sb->s_pending_errors, refcount_read(&sb->s_pending_errors) < 1); } static inline void fserror_pending_dec(struct super_block *sb) { if (refcount_dec_and_test(&sb->s_pending_errors)) wake_up_var(&sb->s_pending_errors); } static inline void fserror_free_event(struct fserror_event *event) { fserror_pending_dec(event->sb); mempool_free(event, &fserror_events_pool); } static void fserror_worker(struct work_struct *work) { struct fserror_event *event = container_of(work, struct fserror_event, work); struct super_block *sb = event->sb; if (sb->s_flags & SB_ACTIVE) { struct fs_error_report report = { /* send positive error number to userspace */ .error = -event->error, .inode = event->inode, .sb = event->sb, }; if (sb->s_op->report_error) sb->s_op->report_error(event); fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, NULL, 0); } iput(event->inode); fserror_free_event(event); } static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, gfp_t gfp_flags) { struct fserror_event *event = NULL; /* * If pending_errors already reached zero or is no longer active, * the superblock is being deactivated so there's no point in * continuing. * * The order of the check of s_pending_errors and SB_ACTIVE are * mandated by order of accesses in generic_shutdown_super and * fserror_unmount. Barriers are implicitly provided by the refcount * manipulations in this function and fserror_unmount. */ if (!refcount_inc_not_zero(&sb->s_pending_errors)) return NULL; if (!(sb->s_flags & SB_ACTIVE)) goto out_pending; event = mempool_alloc(&fserror_events_pool, gfp_flags); if (!event) goto out_pending; /* mempool_alloc doesn't support GFP_ZERO */ memset(event, 0, sizeof(*event)); event->sb = sb; INIT_WORK(&event->work, fserror_worker); return event; out_pending: fserror_pending_dec(sb); return NULL; } /** * fserror_report - report a filesystem error of some kind * * @sb: superblock of the filesystem * @inode: inode within that filesystem, if applicable * @type: type of error encountered * @pos: start of inode range affected, if applicable * @len: length of inode range affected, if applicable * @error: error number encountered, must be negative * @gfp: memory allocation flags for conveying the event to a worker, * since this function can be called from atomic contexts * * Report details of a filesystem error to the super_operations::report_error * callback if present; and to fsnotify for distribution to userspace. @sb, * @gfp, @type, and @error must all be specified. For file I/O errors, the * @inode, @pos, and @len fields must also be specified. For file metadata * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb * must point to @sb. * * Reporting work is deferred to a workqueue to ensure that ->report_error is * called from process context without any locks held. An active reference to * the inode is maintained until event handling is complete, and unmount will * wait for queued events to drain. */ void fserror_report(struct super_block *sb, struct inode *inode, enum fserror_type type, loff_t pos, u64 len, int error, gfp_t gfp) { struct fserror_event *event; /* sb and inode must be from the same filesystem */ WARN_ON_ONCE(inode && inode->i_sb != sb); /* error number must be negative */ WARN_ON_ONCE(error >= 0); event = fserror_alloc_event(sb, gfp); if (!event) goto lost; event->type = type; event->pos = pos; event->len = len; event->error = error; /* * Can't iput from non-sleeping context, so grabbing another reference * to the inode must be the last thing before submitting the event. */ if (inode) { event->inode = igrab(inode); if (!event->inode) goto lost_event; } /* * Use schedule_work here even if we're already in process context so * that fsnotify and super_operations::report_error implementations are * guaranteed to run in process context without any locks held. Since * errors are supposed to be rare, the overhead shouldn't kill us any * more than the failing device will. */ schedule_work(&event->work); return; lost_event: fserror_free_event(event); lost: if (inode) pr_err_ratelimited( "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", sb->s_id, inode->i_ino, type, pos, len, error); else pr_err_ratelimited( "%s: lost filesystem error report for type %u error %d", sb->s_id, type, error); } EXPORT_SYMBOL_GPL(fserror_report); static int __init fserror_init(void) { return mempool_init_kmalloc_pool(&fserror_events_pool, FSERROR_DEFAULT_EVENT_POOL_SIZE, sizeof(struct fserror_event)); } fs_initcall(fserror_init); |
| 587 8 8 8 8 8 8 8 5 6 6 6 6 6 5 6 6 6 5 5 5 5 6 186 186 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 | /* * Performance events x86 architecture code * * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/kvm_types.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <linux/kvm_types.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include <asm/uprobes.h> #include <asm/ibt.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined * from just a typename, as opposed to an actual function. */ DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. */ DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; u64 delta; if (unlikely(!hwc->event_base)) return 0; /* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically: */ prev_raw_count = local64_read(&hwc->prev_count); do { new_raw_count = rdpmc(hwc->event_base_rdpmc); } while (!local64_try_cmpxchg(&hwc->prev_count, &prev_raw_count, new_raw_count)); /* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count. */ delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } /* * Find and validate any extra registers to set up. */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; if (!extra_regs) return 0; for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; /* Check if the extra msrs can be safely accessed*/ if (!er->extra_msr_access) return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; reg->reg = er->msr; break; } return 0; } static atomic_t active_events; static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC static inline u64 get_possible_counter_mask(void) { u64 cntr_mask = x86_pmu.cntr_mask64; int i; if (!is_hybrid()) return cntr_mask; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64; return cntr_mask; } static bool reserve_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i, end; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } return true; eventsel_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_evntsel_nmi(x86_pmu_config_addr(i)); i = X86_PMC_IDX_MAX; perfctr_fail: end = i; for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } static void release_pmc_hardware(void) { u64 cntr_mask = get_possible_counter_mask(); int i; for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } } #else static bool reserve_pmc_hardware(void) { return true; } static void release_pmc_hardware(void) {} #endif bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, unsigned long *fixed_cntr_mask) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { bios_fail = 1; val_fail = val; reg_fail = reg; } else { reg_safe = i; } } if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrq_safe(reg, &val); if (ret) goto msr_fail; for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; } } } /* * If all the counters are enabled, the below test will always * fail. The tools will also become useless in this scenario. * Just fail and disable the hardware counters. */ if (reg_safe == -1) { reg = reg_safe; goto msr_fail; } /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ reg = x86_pmu_event_addr(reg_safe); if (rdmsrq_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; ret = wrmsrq_safe(reg, val); ret |= rdmsrq_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; /* * We still allow the PMU driver to operate: */ if (bios_fail) { pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); } return true; msr_fail: if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { pr_cont("PMU not available due to virtualization, using software events only.\n"); } else { pr_cont("Broken PMU hardware detected, using software events only.\n"); pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); } return false; } static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) { hw_perf_event_destroy(event); /* undo the lbr/bts event accounting */ x86_del_exclusive(x86_lbr_exclusive_lbr); } static inline int x86_pmu_initialized(void) { return x86_pmu.handle_irq != NULL; } static inline int set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; config = attr->config; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX); cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX); cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; if (val == -1) return -EINVAL; hwc->config |= val; attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } int x86_reserve_hardware(void) { int err = 0; if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) { err = -EBUSY; } else { reserve_ds_buffers(); reserve_lbr_buffers(); } } if (!err) atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } return err; } void x86_release_hardware(void) { if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } /* * Check if we can create event of a certain type (that no conflicting events * are present). */ int x86_add_exclusive(unsigned int what) { int i; /* * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. * LBR and BTS are still mutually exclusive. */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) goto out; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) goto fail_unlock; } atomic_inc(&x86_pmu.lbr_exclusive[what]); mutex_unlock(&pmc_reserve_mutex); } out: atomic_inc(&active_events); return 0; fail_unlock: mutex_unlock(&pmc_reserve_mutex); return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&active_events); /* * See the comment in x86_add_exclusive(). */ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); } int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events); /* * The generic map: */ config = x86_pmu.event_map(attr->config); if (config == 0) return -ENOENT; if (config == -1LL) return -EINVAL; hwc->config |= config; return 0; } /* * check that branch_sample_type is compatible with * settings needed for precise_ip > 1 which implies * using the LBR to capture ALL taken branches at the * priv levels of the measurement */ static inline int precise_br_compat(struct perf_event *event) { u64 m = event->attr.branch_sample_type; u64 b = 0; /* must capture all branches */ if (!(m & PERF_SAMPLE_BRANCH_ANY)) return 0; m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_user) b |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) b |= PERF_SAMPLE_BRANCH_KERNEL; /* * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 */ return m == b; } int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { /* arch PEBS */ if (x86_pmu.arch_pebs) { precise = 2; if (hybrid(pmu, arch_pebs_cap).pdists) precise++; return precise; } /* legacy PEBS - support for constant skid */ precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; if (x86_pmu.pebs_prec_dist) precise++; } return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; /* There's no sense in having PEBS for non sampling events: */ if (!is_sampling_event(event)) return -EINVAL; } /* * check that PEBS LBR correction does not conflict with * whatever the user is asking with attr->branch_sample_type */ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { u64 *br_type = &event->attr.branch_sample_type; if (has_branch_stack(event)) { if (!precise_br_compat(event)) return -EOPNOTSUPP; /* branch_sample_type is compatible */ } else { /* * user did not specify branch_sample_type * * For PEBS fixups, we capture all * the branches at the priv level of the * event. */ *br_type = PERF_SAMPLE_BRANCH_ANY; if (!event->attr.exclude_user) *br_type |= PERF_SAMPLE_BRANCH_USER; if (!event->attr.exclude_kernel) *br_type |= PERF_SAMPLE_BRANCH_KERNEL; } } if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ event->hw.config = ARCH_PERFMON_EVENTSEL_INT; /* * Count user and OS events unless requested not to */ if (!event->attr.exclude_user) event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) return -EINVAL; } /* sample_regs_user never support XMM registers */ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) return -EINVAL; } return x86_setup_perfctr(event); } /* * Setup the hardware configuration for a given attr_type */ static int __x86_pmu_event_init(struct perf_event *event) { int err; if (!x86_pmu_initialized()) return -ENODEV; err = x86_reserve_hardware(); if (err) return err; atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); } void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrq(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrq(x86_pmu_config_addr(idx), val); if (is_counter_pair(hwc)) wrmsrq(x86_pmu_config_addr(idx + 1), 0); } } struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_FOR_KVM(perf_guest_get_msrs); /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. * * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. * It will not be re-enabled in the NMI handler again, because enabled=0. After * handling the NMI, disable_all will be called, which will not change the * state either. If PMI hits after disable_all, the PMU is already disabled * before entering NMI handler. The NMI handler will not change the state * either. * * So either situation is harmless. */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu_initialized()) return; if (!cpuc->enabled) return; cpuc->n_added = 0; cpuc->enabled = 0; barrier(); static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } int is_x86_event(struct perf_event *event) { /* * For a non-hybrid platforms, the type of X86 pmu is * always PERF_TYPE_RAW. * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE * is a unique capability for the X86 PMU. * Use them to detect a X86 event. */ if (event->pmu->type == PERF_TYPE_RAW || event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) return true; return false; } struct pmu *x86_get_pmu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); /* * All CPUs of the hybrid type have been offline. * The x86_get_pmu() should not be invoked. */ if (WARN_ON_ONCE(!cpuc->pmu)) return &pmu; return cpuc->pmu; } /* * Event scheduler state: * * Assign events iterating over all events and counters, beginning * with events with least weights first. Keep the current iterator * state in struct sched_state. */ struct sched_state { int weight; int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ int nr_gp; /* number of GP counters used */ u64 used; }; /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ #define SCHED_STATES_MAX 2 struct perf_sched { int max_weight; int max_events; int max_gp; int saved_states; struct event_constraint **constraints; struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. */ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { struct perf_event *event; int n, max_count; max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) { return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i]; } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * The late setup (after counters are scheduled) * is required for some cases, e.g., PEBS counters * snapshotting. Because an accurate counter index * is needed. */ static_call_cond(x86_pmu_late_setup)(); /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); cpuc->events[hwc->idx] = NULL; } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; if (hwc->state & PERF_HES_ARCH) continue; /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ cpuc->events[hwc->idx] = event; x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrq(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. */ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. */ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; unsigned long *cntr_mask, *fixed_cntr_mask; struct event_constraint *pebs_constraints; struct cpu_hw_events *cpuc; u64 pebs, debugctl; int cpu, idx; guard(irqsave)(); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_events, cpu); cntr_mask = hybrid(cpuc->pmu, cntr_mask); fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; if (x86_pmu.version >= 2) { rdmsrq(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrq(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrq(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrq(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrq(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrq(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. */ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 last_period; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. */ apic_write(APIC_LVTPC, APIC_DM_NMI); for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; last_period = event->hw.last_period; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, last_period); perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); perf_event_overflow(event, &data, regs); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU void perf_load_guest_lvtpc(u32 guest_lvtpc) { u32 masked = guest_lvtpc & APIC_LVT_MASKED; apic_write(APIC_LVTPC, APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); this_cpu_write(guest_lvtpc_loaded, true); } EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); void perf_put_guest_lvtpc(void) { this_cpu_write(guest_lvtpc_loaded, false); apic_write(APIC_LVTPC, APIC_DM_NMI); } EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); #endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due * to a PMI. Attempting to handle a PMI while the guest's context is * loaded will generate false positives and clobber guest state. Note, * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector * while host events are quiesced. */ if (this_cpu_read(guest_lvtpc_loaded)) return NMI_DONE; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. */ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config = x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. */ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. */ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); pr_info("... max period: %016llx\n", x86_pmu.max_period); pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); /* enable userspace RDPMC usage by default */ x86_pmu.attr_rdpmc = X86_USER_RDPMC_CONDITIONAL_ENABLE; for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; if (!x86_pmu.config_mask) x86_pmu.config_mask = X86_RAW_EVENT_MASK; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. */ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. */ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ cpuc->txn_flags = txn_flags; if (txn_flags & ~PERF_PMU_TXN_ADD) return; perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); __this_cpu_write(cpu_hw_events.n_txn_pair, 0); __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* * Stop group events scheduling transaction * Clear the flag and pmu::enable() will perform the * schedulability test. */ static void x86_pmu_cancel_txn(struct pmu *pmu) { unsigned int txn_flags; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ txn_flags = cpuc->txn_flags; cpuc->txn_flags = 0; if (txn_flags & ~PERF_PMU_TXN_ADD) return; /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } /* * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success * * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int assign[X86_PMC_IDX_MAX]; int n, ret; WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { cpuc->txn_flags = 0; return 0; } n = cpuc->n_events; if (!x86_pmu_initialized()) return -EAGAIN; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } /* * a fake_cpuc is used to validate event groups. Due to * the extra reg logic, we need to also allocate a fake * per_core and per_cpu structure. Otherwise, group events * using extra reg may conflict without the kernel being * able to catch this when the last event gets added to * the group. */ static void free_fake_cpuc(struct cpu_hw_events *cpuc) { intel_cpuc_finish(cpuc); kfree(cpuc); } static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; if (is_hybrid()) { struct x86_hybrid_pmu *h_pmu; h_pmu = hybrid_pmu(event_pmu); if (cpumask_empty(&h_pmu->supported_cpus)) goto error; cpu = cpumask_first(&h_pmu->supported_cpus); } else cpu = raw_smp_processor_id(); cpuc->pmu = event_pmu; if (intel_cpuc_prepare(cpuc, cpu)) goto error; return cpuc; error: free_fake_cpuc(cpuc); return ERR_PTR(-ENOMEM); } /* * validate that we can schedule this event */ static int validate_event(struct perf_event *event) { struct cpu_hw_events *fake_cpuc; struct event_constraint *c; int ret = 0; fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); free_fake_cpuc(fake_cpuc); return ret; } /* * validate a single event group * * validation include: * - check events are compatible which each other * - events do not compete for the same counter * - number of events <= number of counters * * validation ensures the group can be loaded onto the * PMU if it was the only group available. */ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; /* * Reject events from different hybrid PMUs. */ if (is_hybrid()) { struct perf_event *sibling; struct pmu *pmu = NULL; if (is_x86_event(leader)) pmu = leader->pmu; for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) continue; if (!pmu) pmu = sibling->pmu; else if (pmu != sibling->pmu) return ret; } } fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ n = collect_events(fake_cpuc, leader, true); if (n < 0) goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) goto out; fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: free_fake_cpuc(fake_cpuc); return ret; } static int x86_pmu_event_init(struct perf_event *event) { struct x86_hybrid_pmu *pmu = NULL; int err; if ((event->attr.type != event->pmu->type) && (event->attr.type != PERF_TYPE_HARDWARE) && (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; if (is_hybrid() && (event->cpu != -1)) { pmu = hybrid_pmu(event->pmu); if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); } if (err) { if (event->destroy) event->destroy(event); event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } void perf_clear_dirty_counters(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; /* Don't need to clear the assigned counter. */ for (i = 0; i < cpuc->n_events; i++) __clear_bit(cpuc->assign[i], cpuc->dirty); if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; wrmsrq(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrq(x86_pmu_event_addr(i), 0); } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* * This function relies on not being called concurrently in two * tasks in the same mm. Otherwise one task could observe * perf_rdpmc_allowed > 1 and return all the way back to * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. */ mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) return INTEL_PMC_FIXED_RDPMC_METRICS + 1; else return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, struct device_attribute *attr, char *buf) { return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); } /* * Behaviors of rdpmc value: * - rdpmc = 0 * global user space rdpmc and counter level's user space rdpmc of all * counters are both disabled. * - rdpmc = 1 * global user space rdpmc is enabled in mmap enabled time window and * counter level's user space rdpmc is enabled for only non system-wide * events. Counter level's user space rdpmc of system-wide events is * still disabled by default. This won't introduce counter data leak for * non system-wide events since their count data would be cleared when * context switches. * - rdpmc = 2 * global user space rdpmc and counter level's user space rdpmc of all * counters are enabled unconditionally. * * Suppose the rdpmc value won't be changed frequently, don't dynamically * reschedule events to make the new rpdmc value take effect on active perf * events immediately, the new rdpmc value would only impact the new * activated perf events. This makes code simpler and cleaner. */ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { static DEFINE_MUTEX(rdpmc_mutex); unsigned long val; ssize_t ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; if (val > 2) return -EINVAL; if (x86_pmu.attr_rdpmc_broken) return -ENOTSUPP; guard(mutex)(&rdpmc_mutex); if (val != x86_pmu.attr_rdpmc) { /* * Changing into or out of never available or always available, * aka perf-event-bypassing mode. This path is extremely slow, * but only root can trigger it, so it's okay. */ if (val == 0) static_branch_inc(&rdpmc_never_available_key); else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_NEVER_ENABLE) static_branch_dec(&rdpmc_never_available_key); if (val == 2) static_branch_inc(&rdpmc_always_available_key); else if (x86_pmu.attr_rdpmc == X86_USER_RDPMC_ALWAYS_ENABLE) static_branch_dec(&rdpmc_always_available_key); on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } return count; } static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); static struct attribute *x86_pmu_attrs[] = { &dev_attr_rdpmc.attr, NULL, }; static struct attribute_group x86_pmu_attr_group __ro_after_init = { .attrs = x86_pmu_attrs, }; static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { struct pmu *pmu = dev_get_drvdata(cdev); return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); static struct attribute *x86_pmu_caps_attrs[] = { &dev_attr_max_precise.attr, NULL }; static struct attribute_group x86_pmu_caps_group __ro_after_init = { .name = "caps", .attrs = x86_pmu_caps_attrs, }; static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, &x86_pmu_caps_group, NULL, }; static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in) { static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in); } void perf_check_microcode(void) { if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } static int x86_pmu_check_period(struct perf_event *event, u64 value) { if (x86_pmu.check_period && x86_pmu.check_period(event, value)) return -EINVAL; if (value && x86_pmu.limit_period) { s64 left = value; x86_pmu.limit_period(event, &left); if (left > value) return -EINVAL; } return 0; } static int x86_pmu_aux_output_match(struct perf_event *event) { if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) return 0; if (x86_pmu.aux_output_match) return x86_pmu.aux_output_match(event); return 0; } static bool x86_pmu_filter(struct pmu *pmu, int cpu) { bool ret = false; static_call_cond(x86_pmu_filter)(pmu, cpu, &ret); return ret; } static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, .event_mapped = x86_pmu_event_mapped, .event_unmapped = x86_pmu_event_unmapped, .add = x86_pmu_add, .del = x86_pmu_del, .start = x86_pmu_start, .stop = x86_pmu_stop, .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, .filter = x86_pmu_filter, }; void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) return; cyc2ns_read_begin(&data); offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. */ userpg->cap_user_time = 1; userpg->time_mult = data.cyc2ns_mul; userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = offset; } cyc2ns_read_end(); } /* * Determine whether the regs were taken from an irq/exception handler rather * than from perf_arch_fetch_caller_regs(). */ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_hw_regs(regs)) { if (perf_callchain_store(entry, regs->ip)) return; unwind_start(&state, current, regs, NULL); } else { unwind_start(&state, current, NULL, (void *)regs->sp); } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* * If we're not in a valid context with a real (not just lazy) * user mm, then don't even try. */ if (!nmi_uaccess_okay()) return 0; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = smp_load_acquire(¤t->mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; u32 ret_addr; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); /* see perf_callchain_user() below for why we do this */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const u32 __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; unsigned long ret_addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. */ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); /* * If we are called from uprobe handler, and we are indeed at the very * entry to user function (which is normally a `push %rbp` instruction, * under assumption of application being compiled with frame pointers), * we should read return address from *regs->sp before proceeding * to follow frame pointers, otherwise we'll skip immediate caller * as %rbp is not yet setup. */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const unsigned long __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return regs->ip + code_segment_base(regs); } static unsigned long common_misc_flags(struct pt_regs *regs) { if (regs->flags & PERF_EFLAGS_EXACT) return PERF_RECORD_MISC_EXACT_IP; return 0; } static unsigned long guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } static unsigned long host_misc_flags(struct pt_regs *regs) { if (user_mode(regs)) return PERF_RECORD_MISC_USER; else return PERF_RECORD_MISC_KERNEL; } unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= guest_misc_flags(regs); return flags; } unsigned long perf_arch_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= host_misc_flags(regs); return flags; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. */ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_FOR_KVM(perf_get_hw_event_config); |
| 375 375 374 375 375 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif |
| 12 1117 1439 274 246 245 244 244 117 244 3 1 3 175 9 244 59 1424 102 53 102 35 102 19 97 5 1426 10 10 13 102 271 102 3 1428 1279 1280 147 146 1114 218 175 1012 1200 249 306 13 897 13 897 1427 272 7 272 264 264 1429 885 883 885 13 1081 271 7 1428 1425 272 7 271 1245 12 6 1426 1394 1429 885 884 13 883 1427 272 270 7 1428 70 12 70 44 70 7 69 68 70 18 1429 263 1429 1392 1581 1114 1450 1449 1450 69 36 17 1303 297 1304 298 1305 1304 77 1 76 77 1392 1397 1396 63 1390 1393 792 1220 1394 1392 1393 1394 264 1392 76 1394 1394 1392 51 792 1221 1387 1396 590 89 1 1 1 89 2005 2004 2003 2002 2002 1879 1880 1877 2006 2006 2004 19 96 5 21 102 5 102 99 100 101 102 102 102 102 71 34 670 671 669 671 669 671 671 670 669 640 515 223 518 53 205 64 54 63 18 63 63 39 39 63 52 51 21 18 18 34 19 33 21 21 21 16 21 5 63 63 1484 1484 1483 1484 1484 1483 1286 1482 1286 1284 30 1262 27 3 1 26 26 1287 39 187 39 39 39 39 27 17 10 39 39 17 39 21 39 17 38 39 1119 1119 1120 1086 1121 1324 766 765 765 1520 197 1524 1118 1027 1118 1119 1117 1119 1116 1119 1114 196 1117 2 2 2 2 1121 612 315 1121 614 614 1122 1119 1122 1122 1120 910 1122 1120 1122 29 1118 846 1122 10 1119 6 1121 852 1120 1070 1121 1121 1120 1118 1121 1120 1122 1120 1120 1121 1122 1122 177 1122 1 766 408 593 407 408 408 407 407 2 408 401 398 399 8 1 401 670 671 649 643 231 645 526 6 667 205 520 523 27 24 70 63 12 63 10 10 1305 298 298 298 298 1305 297 10 298 1306 1304 298 1304 59 1101 941 311 312 312 59 311 312 186 133 23 186 61 185 187 187 187 187 21 12 12 186 168 169 169 169 59 169 21 169 169 632 630 630 513 18 632 17 12 773 770 629 765 764 624 846 1405 743 747 742 743 27 28 25 24 1382 1383 826 739 739 728 728 224 224 224 224 224 227 227 227 227 227 224 224 224 224 224 224 1 224 224 224 224 224 224 224 224 224 224 224 224 3 227 224 224 227 224 227 227 227 227 227 227 227 227 227 227 851 102 110 103 1 1251 1326 1329 766 765 593 1328 1327 92 17 92 17 1327 1326 844 842 844 510 439 439 507 1309 1312 1312 844 490 299 195 1242 415 1242 1242 1058 1242 1054 76 77 1249 83 48 48 35 48 1250 1059 1251 354 122 359 354 353 1 354 354 352 354 853 511 353 122 1330 852 1329 1329 1330 360 1320 862 1321 1321 1312 1250 354 1251 354 353 353 1326 77 2 110 26 1332 1331 1331 627 1328 1326 1327 11 11 7 7 6 3 1 2 3 5 5 5 5 6 1 6 11 587 587 587 587 587 587 12 587 586 18 16 16 27 22 22 22 14 6 4 13 21 21 21 21 21 21 18 16 6 16 16 16 16 16 21 27 206 206 205 204 18 205 203 145 146 146 146 146 146 146 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 | // SPDX-License-Identifier: GPL-2.0-or-later /* * VMA-specific functions. */ #include "vma_internal.h" #include "vma.h" struct mmap_state { struct mm_struct *mm; struct vma_iterator *vmi; unsigned long addr; unsigned long end; pgoff_t pgoff; unsigned long pglen; vm_flags_t vm_flags; struct file *file; pgprot_t page_prot; /* User-defined fields, perhaps updated by .mmap_prepare(). */ const struct vm_operations_struct *vm_ops; void *vm_private_data; unsigned long charged; struct vm_area_struct *prev; struct vm_area_struct *next; /* Unmapping state. */ struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; /* Determine if we can check KSM flags early in mmap() logic. */ bool check_ksm_early :1; /* If we map new, hold the file rmap lock on mapping. */ bool hold_file_rmap_lock :1; /* If .mmap_prepare changed the file, we don't need to pin. */ bool file_doesnt_need_get :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .vm_flags = vm_flags_, \ .file = file_, \ .page_prot = vm_get_page_prot(vm_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ struct vma_merge_struct name = { \ .mm = (map_)->mm, \ .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ .vm_flags = (map_)->vm_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ .middle = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ } /* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */ static bool vma_is_fork_child(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by the anon_vma root * lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; struct vm_area_struct *src = vmg->middle; /* existing merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; /* * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we * will remove the existing VMA's anon_vma's so there's no scalability * concerns. */ VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ if (!tgt_anon && src_anon) { struct vm_area_struct *copied_from = vmg->copied_from; if (vma_is_fork_child(src)) return false; if (vma_is_fork_child(copied_from)) return false; return true; } /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) return !vma_is_fork_child(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @vmg: The merge state that will be used to determine adjustment and VMA * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; struct vm_area_struct **remove = &vp->remove; memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; if (vmg && vmg->__remove_middle) { *remove = vmg->middle; remove = &vp->remove2; } if (vmg && vmg->__remove_next) *remove = vmg->next; if (vmg && vmg->__adjust_middle_start) adjust = vmg->middle; else if (vmg && vmg->__adjust_next_start) adjust = vmg->next; else adjust = NULL; vp->adj_next = adjust; if (!vp->anon_vma && adjust) vp->anon_vma = adjust->anon_vma; VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; if (vmg && vmg->skip_vma_uprobe) vp->skip_vma_uprobe = true; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store_new(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); if (!vp->skip_vma_uprobe) { uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } } if (vp->remove) { again: vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { struct vm_area_struct *next = vmg->next; struct vm_area_struct *prev; if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. * * We therefore check this in addition to mergeability to either side. */ prev = vmg->prev; return !prev->anon_vma || !next->anon_vma || prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, /* mm_wr_locked = */ true); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); /* * Get rid of huge pages and shared page tables straddling the split * boundary. */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (is_vm_hugetlb_page(vma)) hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the * instance that the destination VMA has no anon_vma but the source does. * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * There are three cases to consider for correctly propagating * anon_vma's on merge. * * The first is trivial - neither VMA has anon_vma, we need not do * anything. * * The second where both have anon_vma is also a no-op, as they must * then be the same, so there is simply nothing to copy. * * Here we cover the third - if the destination VMA has no anon_vma, * that is it is unfaulted, we need to ensure that the newly merged * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * Based on the vmg flag indicating whether we need to adjust the vm_start field * for the middle or next VMA, we calculate what the range of the newly adjusted * VMA ought to be, and set the VMA's range accordingly. */ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; pgoff_t pgoff; if (vmg->__adjust_middle_start) { adjust = vmg->middle; pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { adjust = vmg->next; pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); } else { return; } vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); } /* * Actually perform the VMA merge operation. * * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not * modify any VMAs or cause inconsistent state should an OOM condition arise. * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) { struct vm_area_struct *vma; struct vma_prepare vp; if (vmg->__adjust_next_start) { /* We manipulate middle and adjust next, which is the target. */ vma = vmg->middle; vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } init_multi_vma_prep(&vp, vma, vmg); /* * If vmg->give_up_on_oom is set, we're safe, because we don't actually * manipulate any VMAs until we succeed at preallocation. * * Past this point, we will not return an error. */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); /* * THP pages may need to do additional splits if we increase * middle->vm_start. */ vma_adjust_trans_huge(vma, vmg->start, vmg->end, vmg->__adjust_middle_start ? vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); vmg_adjust_set_range(vmg); vma_iter_store_overwrite(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->middle->vm_mm address space to account * for this, as well as any VMAs shrunk/expanded/deleted as a result of this * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modified to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = middle && start == middle->vm_start; bool right_side = middle && end == middle->vm_end; int err = 0; bool merge_left, merge_right, merge_both; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(middle && ((middle != prev && vmg->start != middle->vm_start) || vmg->end > middle->vm_end), vmg); /* The vmi must be positioned within vmg->middle. */ VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); /* An existing merge can never be used by the mremap() logic. */ VM_WARN_ON_VMG(vmg->copied_from, vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ vmg->__remove_middle = left_side && right_side; /* * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and middle (thereby deleting middle). */ if (vmg->__remove_next && !can_merge_remove_vma(next)) { vmg->__remove_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); if (merge_right) { vma_start_write(next); vmg->target = next; sticky_flags |= (next->vm_flags & VM_STICKY); } if (merge_left) { vma_start_write(prev); vmg->target = prev; sticky_flags |= (prev->vm_flags & VM_STICKY); } if (merge_both) { /* * |<-------------------->| * |-------********-------| * prev middle next * extend delete delete */ vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or middle contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : middle, &anon_dup); } else if (merge_left) { /* * |<------------>| OR * |<----------------->| * |-------************* * prev middle * extend shrink/delete */ vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!vmg->__remove_middle) vmg->__adjust_middle_start = true; err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* * |<------------->| OR * |<----------------->| * *************-------| * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be middle. */ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* We shrink middle and expand next. */ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; } err = dup_anon_vma(next, middle, &anon_dup); } if (err || commit_merge(vmg)) goto abort; vm_flags_set(vmg->target, sticky_flags); khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); if (anon_dup) unlink_anon_vmas(anon_dup); /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the * user doesn't want this reported and instead just wants to give up on * the merge, allow it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); VM_WARN_ON_VMG(vmg->target, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->target = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; vmg->target = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->target && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; } return NULL; } /* * vma_merge_copied_range - Attempt to merge a VMA that is being copied by * mremap() * * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if * possible. * * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO * range, i.e. the target range for the VMA. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain * the copied-from VMA. */ static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg) { /* We must have a copied-from VMA. */ VM_WARN_ON_VMG(!vmg->middle, vmg); vmg->copied_from = vmg->middle; vmg->middle = NULL; return vma_merge_new_range(vmg); } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->target and vmg->end == * vmg->next->vm_end. Checking if the vmg->target can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have set @vmg->target and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; bool remove_next = false; vm_flags_t sticky_flags; int ret = 0; mmap_assert_write_locked(vmg->mm); vma_start_write(target); if (next && target != next && vmg->end == next->vm_end) remove_next = true; /* We must have a target. */ VM_WARN_ON_VMG(!target, vmg); /* This should have already been checked by this point. */ VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != target && vmg->end > next->vm_start, vmg); /* Only handles expanding. */ VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); sticky_flags = vmg->vm_flags & VM_STICKY; sticky_flags |= target->vm_flags & VM_STICKY; if (remove_next) sticky_flags |= next->vm_flags & VM_STICKY; /* * If we are removing the next VMA or copying from a VMA * (e.g. mremap()'ing), we must propagate anon_vma state. * * Note that, by convention, callers ignore OOM for this case, so * we don't need to account for vmg->give_up_on_mm here. */ if (remove_next) ret = dup_anon_vma(target, next, &anon_dup); if (!ret && vmg->copied_from) ret = dup_anon_vma(target, vmg->copied_from, &anon_dup); if (ret) return ret; if (remove_next) { vma_start_write(next); vmg->__remove_next = true; } if (commit_merge(vmg)) goto nomem; vm_flags_set(target, sticky_flags); return 0; nomem: if (anon_dup) unlink_anon_vmas(anon_dup); /* * If the user requests that we just give upon OOM, we are safe to do so * here, as commit merge provides this contract to us. Nothing has been * changed - no harm no foul, just don't report it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; if (!vms->clear_ptes) /* Nothing to do */ return; /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. */ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= sysctl_max_map_count) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? */ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (next->vm_flags & VM_LOCKED) vms->locked_vm += nrpages; if (next->vm_flags & VM_ACCOUNT) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->middle; unsigned long start = vmg->start; unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); /* * Split can fail for reasons other than OOM, so if the user requests * this it's probably a mistake. */ VM_WARN_ON(vmg->give_up_on_oom && (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ if (vma->vm_end > end) { int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t *vm_flags_ptr) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); const vm_flags_t vm_flags = *vm_flags_ptr; struct vm_area_struct *ret; vmg.vm_flags = vm_flags; ret = vma_modify(&vmg); if (IS_ERR(ret)) return ret; /* * For a merge to succeed, the flags must match those * requested. However, sticky flags may have been retained, so propagate * them to the caller. */ if (vmg.state == VMA_MERGE_SUCCESS) *vm_flags_ptr = ret->vm_flags; return ret; } struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.vm_flags = vm_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); if (!hold_rmap_lock) i_mmap_unlock_write(mapping); } } static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma, /* hold_rmap_lock= */false); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } /* * If the VMA we are copying might contain a uprobe PTE, ensure * that we do not establish one upon merge. Otherwise, when mremap() * moves page tables, it will orphan the newly created PTE. */ if (vma->vm_file) vmg.skip_vma_uprobe = true; new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_copied_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } static void update_ksm_flags(struct mmap_state *map) { map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } static void set_desc_from_map(struct vm_area_desc *desc, const struct mmap_state *map) { desc->start = map->addr; desc->end = map->end; desc->pgoff = map->pgoff; desc->vm_file = map->file; desc->vm_flags = map->vm_flags; desc->page_prot = map->page_prot; } /* * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * As a result it sets up the @map and @desc objects. * * @map: Mapping state. * @desc: VMA descriptor * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map->file, map->vm_flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; map->vm_flags |= VM_ACCOUNT; } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ vms_clean_up_area(vms, &map->mas_detach); set_desc_from_map(desc, map); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = map->file; if (!map->file_doesnt_need_get) get_file(map->file); if (!map->file->f_op->mmap) return 0; error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&vmi->mas, vma, map->prev, map->next); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && !(map->vm_flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); map->file = vma->vm_file; map->vm_flags = vma->vm_flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->vm_flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (map->vm_flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; if (!map->check_ksm_early) { update_ksm_flags(map); vm_flags_init(vma, map->vm_flags); } #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma, map->hold_file_rmap_lock); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); /* Unmap any existing mapping in the area. */ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ if (pgtable_supports_soft_dirty()) vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } static void call_action_prepare(struct mmap_state *map, struct vm_area_desc *desc) { struct mmap_action *action = &desc->action; mmap_action_prepare(action, desc); if (action->hide_from_rmap_until_complete) map->hold_file_rmap_lock = true; } /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. * * This is called prior to any merge attempt, and updates whitelisted fields * that are permitted to be updated by the caller. * * All but user-defined fields will be pre-populated with original values. * * Returns 0 on success, or an error code otherwise. */ static int call_mmap_prepare(struct mmap_state *map, struct vm_area_desc *desc) { int err; /* Invoke the hook. */ err = vfs_mmap_prepare(map->file, desc); if (err) return err; call_action_prepare(map, desc); /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; if (desc->vm_file != map->file) { map->file_doesnt_need_get = true; map->file = desc->vm_file; } map->vm_flags = desc->vm_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ map->vm_ops = desc->vm_ops; map->vm_private_data = desc->private_data; return 0; } static void set_vma_user_defined_fields(struct vm_area_struct *vma, struct mmap_state *map) { if (map->vm_ops) vma->vm_ops = map->vm_ops; vma->vm_private_data = map->vm_private_data; } /* * Are we guaranteed no driver can change state such as to preclude KSM merging? * If so, let's set the KSM mergeable flag early so we don't break VMA merging. */ static bool can_set_ksm_flags_early(struct mmap_state *map) { struct file *file = map->file; /* Anonymous mappings have no driver which can change them. */ if (!file) return true; /* * If .mmap_prepare() is specified, then the driver will have already * manipulated state prior to updating KSM flags. So no need to worry * about mmap callbacks modifying VMA flags after the KSM flag has been * updated here, which could otherwise affect KSM eligibility. */ if (file->f_op->mmap_prepare) return true; /* shmem is safe. */ if (shmem_file(file)) return true; /* Any other .mmap callback is not safe. */ return false; } static int call_action_complete(struct mmap_state *map, struct vm_area_desc *desc, struct vm_area_struct *vma) { struct mmap_action *action = &desc->action; int ret; ret = mmap_action_complete(action, vma); /* If we held the file rmap we need to release it. */ if (map->hold_file_rmap_lock) { struct file *file = vma->vm_file; i_mmap_unlock_write(file->f_mapping); } return ret; } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); struct vm_area_desc desc = { .mm = mm, .file = file, .action = { .type = MMAP_NOTHING, /* Default to no further action. */ }, }; bool allocated_new = false; int error; map.check_ksm_early = can_set_ksm_flags_early(&map); error = __mmap_setup(&map, &desc, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map, &desc); if (error) goto abort_munmap; if (map.check_ksm_early) update_ksm_flags(&map); /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; allocated_new = true; } if (have_mmap_prepare) set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { error = call_action_complete(&map, &desc, vma); if (error) return error; } return addr; /* Accounting was done by __mmap_setup(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @vm_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; vm_flags = ksm_vma_flags(mm, NULL, vm_flags); if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.just_expand = true; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); if (pgtable_supports_soft_dirty()) vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the * later alignment. The first step is the minimum needed to * fulill the start gap, the next steps is the minimum to align * that. It is the minimum needed to fulill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; } |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TerraTec Cinergy T2/qanu USB2 DVB-T adapter. * * Copyright (C) 2007 Tomi Orava (tomimo@ncircle.nullnet.fi) * * Based on the dvb-usb-framework code and the * original Terratec Cinergy T2 driver by: * * Copyright (C) 2004 Daniel Mack <daniel@qanu.de> and * Holger Waechtler <holger@qanu.de> * * Protocol Spec published on http://qanu.de/specs/terratec_cinergyT2.pdf */ #include "cinergyT2.h" /* * convert linux-dvb frontend parameter set into TPS. * See ETSI ETS-300744, section 4.6.2, table 9 for details. * * This function is probably reusable and may better get placed in a support * library. * * We replace erroneous fields by default TPS fields (the ones with value 0). */ static uint16_t compute_tps(struct dtv_frontend_properties *op) { uint16_t tps = 0; switch (op->code_rate_HP) { case FEC_2_3: tps |= (1 << 7); break; case FEC_3_4: tps |= (2 << 7); break; case FEC_5_6: tps |= (3 << 7); break; case FEC_7_8: tps |= (4 << 7); break; case FEC_1_2: case FEC_AUTO: default: /* tps |= (0 << 7) */; } switch (op->code_rate_LP) { case FEC_2_3: tps |= (1 << 4); break; case FEC_3_4: tps |= (2 << 4); break; case FEC_5_6: tps |= (3 << 4); break; case FEC_7_8: tps |= (4 << 4); break; case FEC_1_2: case FEC_AUTO: default: /* tps |= (0 << 4) */; } switch (op->modulation) { case QAM_16: tps |= (1 << 13); break; case QAM_64: tps |= (2 << 13); break; case QPSK: default: /* tps |= (0 << 13) */; } switch (op->transmission_mode) { case TRANSMISSION_MODE_8K: tps |= (1 << 0); break; case TRANSMISSION_MODE_2K: default: /* tps |= (0 << 0) */; } switch (op->guard_interval) { case GUARD_INTERVAL_1_16: tps |= (1 << 2); break; case GUARD_INTERVAL_1_8: tps |= (2 << 2); break; case GUARD_INTERVAL_1_4: tps |= (3 << 2); break; case GUARD_INTERVAL_1_32: default: /* tps |= (0 << 2) */; } switch (op->hierarchy) { case HIERARCHY_1: tps |= (1 << 10); break; case HIERARCHY_2: tps |= (2 << 10); break; case HIERARCHY_4: tps |= (3 << 10); break; case HIERARCHY_NONE: default: /* tps |= (0 << 10) */; } return tps; } struct cinergyt2_fe_state { struct dvb_frontend fe; struct dvb_usb_device *d; unsigned char data[64]; struct mutex data_mutex; struct dvbt_get_status_msg status; }; static int cinergyt2_fe_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct cinergyt2_fe_state *state = fe->demodulator_priv; int ret; mutex_lock(&state->data_mutex); state->data[0] = CINERGYT2_EP1_GET_TUNER_STATUS; ret = dvb_usb_generic_rw(state->d, state->data, 1, state->data, sizeof(state->status), 0); if (!ret) memcpy(&state->status, state->data, sizeof(state->status)); mutex_unlock(&state->data_mutex); if (ret < 0) return ret; *status = 0; if (0xffff - le16_to_cpu(state->status.gain) > 30) *status |= FE_HAS_SIGNAL; if (state->status.lock_bits & (1 << 6)) *status |= FE_HAS_LOCK; if (state->status.lock_bits & (1 << 5)) *status |= FE_HAS_SYNC; if (state->status.lock_bits & (1 << 4)) *status |= FE_HAS_CARRIER; if (state->status.lock_bits & (1 << 1)) *status |= FE_HAS_VITERBI; if ((*status & (FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC)) != (FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC)) *status &= ~FE_HAS_LOCK; return 0; } static int cinergyt2_fe_read_ber(struct dvb_frontend *fe, u32 *ber) { struct cinergyt2_fe_state *state = fe->demodulator_priv; *ber = le32_to_cpu(state->status.viterbi_error_rate); return 0; } static int cinergyt2_fe_read_unc_blocks(struct dvb_frontend *fe, u32 *unc) { struct cinergyt2_fe_state *state = fe->demodulator_priv; *unc = le32_to_cpu(state->status.uncorrected_block_count); return 0; } static int cinergyt2_fe_read_signal_strength(struct dvb_frontend *fe, u16 *strength) { struct cinergyt2_fe_state *state = fe->demodulator_priv; *strength = (0xffff - le16_to_cpu(state->status.gain)); return 0; } static int cinergyt2_fe_read_snr(struct dvb_frontend *fe, u16 *snr) { struct cinergyt2_fe_state *state = fe->demodulator_priv; *snr = (state->status.snr << 8) | state->status.snr; return 0; } static int cinergyt2_fe_init(struct dvb_frontend *fe) { return 0; } static int cinergyt2_fe_sleep(struct dvb_frontend *fe) { deb_info("cinergyt2_fe_sleep() Called\n"); return 0; } static int cinergyt2_fe_get_tune_settings(struct dvb_frontend *fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 800; return 0; } static int cinergyt2_fe_set_frontend(struct dvb_frontend *fe) { struct dtv_frontend_properties *fep = &fe->dtv_property_cache; struct cinergyt2_fe_state *state = fe->demodulator_priv; struct dvbt_set_parameters_msg *param; int err; mutex_lock(&state->data_mutex); param = (void *)state->data; param->cmd = CINERGYT2_EP1_SET_TUNER_PARAMETERS; param->tps = cpu_to_le16(compute_tps(fep)); param->freq = cpu_to_le32(fep->frequency / 1000); param->flags = 0; switch (fep->bandwidth_hz) { default: case 8000000: param->bandwidth = 8; break; case 7000000: param->bandwidth = 7; break; case 6000000: param->bandwidth = 6; break; } err = dvb_usb_generic_rw(state->d, state->data, sizeof(*param), state->data, 2, 0); if (err < 0) err("cinergyt2_fe_set_frontend() Failed! err=%d\n", err); mutex_unlock(&state->data_mutex); return (err < 0) ? err : 0; } static void cinergyt2_fe_release(struct dvb_frontend *fe) { struct cinergyt2_fe_state *state = fe->demodulator_priv; kfree(state); } static const struct dvb_frontend_ops cinergyt2_fe_ops; struct dvb_frontend *cinergyt2_fe_attach(struct dvb_usb_device *d) { struct cinergyt2_fe_state *s = kzalloc(sizeof( struct cinergyt2_fe_state), GFP_KERNEL); if (s == NULL) return NULL; s->d = d; memcpy(&s->fe.ops, &cinergyt2_fe_ops, sizeof(struct dvb_frontend_ops)); s->fe.demodulator_priv = s; mutex_init(&s->data_mutex); return &s->fe; } static const struct dvb_frontend_ops cinergyt2_fe_ops = { .delsys = { SYS_DVBT }, .info = { .name = DRIVER_NAME, .frequency_min_hz = 174 * MHz, .frequency_max_hz = 862 * MHz, .frequency_stepsize_hz = 166667, .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QAM_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_HIERARCHY_AUTO | FE_CAN_RECOVER | FE_CAN_MUTE_TS }, .release = cinergyt2_fe_release, .init = cinergyt2_fe_init, .sleep = cinergyt2_fe_sleep, .set_frontend = cinergyt2_fe_set_frontend, .get_tune_settings = cinergyt2_fe_get_tune_settings, .read_status = cinergyt2_fe_read_status, .read_ber = cinergyt2_fe_read_ber, .read_signal_strength = cinergyt2_fe_read_signal_strength, .read_snr = cinergyt2_fe_read_snr, .read_ucblocks = cinergyt2_fe_read_unc_blocks, }; |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LWQ_H #define LWQ_H /* * Light-weight single-linked queue built from llist * * Entries can be enqueued from any context with no locking. * Entries can be dequeued from process context with integrated locking. * * This is particularly suitable when work items are queued in * BH or IRQ context, and where work items are handled one at a time * by dedicated threads. */ #include <linux/container_of.h> #include <linux/spinlock.h> #include <linux/llist.h> struct lwq_node { struct llist_node node; }; struct lwq { spinlock_t lock; struct llist_node *ready; /* entries to be dequeued */ struct llist_head new; /* entries being enqueued */ }; /** * lwq_init - initialise a lwq * @q: the lwq object */ static inline void lwq_init(struct lwq *q) { spin_lock_init(&q->lock); q->ready = NULL; init_llist_head(&q->new); } /** * lwq_empty - test if lwq contains any entry * @q: the lwq object * * This empty test contains an acquire barrier so that if a wakeup * is sent when lwq_dequeue returns true, it is safe to go to sleep after * a test on lwq_empty(). */ static inline bool lwq_empty(struct lwq *q) { /* acquire ensures ordering wrt lwq_enqueue() */ return smp_load_acquire(&q->ready) == NULL && llist_empty(&q->new); } struct llist_node *__lwq_dequeue(struct lwq *q); /** * lwq_dequeue - dequeue first (oldest) entry from lwq * @q: the queue to dequeue from * @type: the type of object to return * @member: them member in returned object which is an lwq_node. * * Remove a single object from the lwq and return it. This will take * a spinlock and so must always be called in the same context, typcially * process contet. */ #define lwq_dequeue(q, type, member) \ ({ struct llist_node *_n = __lwq_dequeue(q); \ _n ? container_of(_n, type, member.node) : NULL; }) struct llist_node *lwq_dequeue_all(struct lwq *q); /** * lwq_for_each_safe - iterate over detached queue allowing deletion * @_n: iterator variable * @_t1: temporary struct llist_node ** * @_t2: temporary struct llist_node * * @_l: address of llist_node pointer from lwq_dequeue_all() * @_member: member in _n where lwq_node is found. * * Iterate over members in a dequeued list. If the iterator variable * is set to NULL, the iterator removes that entry from the queue. */ #define lwq_for_each_safe(_n, _t1, _t2, _l, _member) \ for (_t1 = (_l); \ *(_t1) ? (_n = container_of(*(_t1), typeof(*(_n)), _member.node),\ _t2 = ((*_t1)->next), \ true) \ : false; \ (_n) ? (_t1 = &(_n)->_member.node.next, 0) \ : ((*(_t1) = (_t2)), 0)) /** * lwq_enqueue - add a new item to the end of the queue * @n - the lwq_node embedded in the item to be added * @q - the lwq to append to. * * No locking is needed to append to the queue so this can * be called from any context. * Return %true is the list may have previously been empty. */ static inline bool lwq_enqueue(struct lwq_node *n, struct lwq *q) { /* acquire enqures ordering wrt lwq_dequeue */ return llist_add(&n->node, &q->new) && smp_load_acquire(&q->ready) == NULL; } /** * lwq_enqueue_batch - add a list of new items to the end of the queue * @n - the lwq_node embedded in the first item to be added * @q - the lwq to append to. * * No locking is needed to append to the queue so this can * be called from any context. * Return %true is the list may have previously been empty. */ static inline bool lwq_enqueue_batch(struct llist_node *n, struct lwq *q) { struct llist_node *e = n; /* acquire enqures ordering wrt lwq_dequeue */ return llist_add_batch(llist_reverse_order(n), e, &q->new) && smp_load_acquire(&q->ready) == NULL; } #endif /* LWQ_H */ |
| 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved * * DMA operations that map physical memory through IOMMU. */ #ifndef _LINUX_IOMMU_DMA_H #define _LINUX_IOMMU_DMA_H #include <linux/dma-direction.h> #ifdef CONFIG_IOMMU_DMA static inline bool use_dma_iommu(struct device *dev) { return dev->dma_iommu; } #else static inline bool use_dma_iommu(struct device *dev) { return false; } #endif /* CONFIG_IOMMU_DMA */ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, unsigned long attrs); int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); unsigned long iommu_dma_get_merge_boundary(struct device *dev); size_t iommu_dma_opt_mapping_size(void); size_t iommu_dma_max_mapping_size(struct device *dev); void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs); struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt); #define iommu_dma_vunmap_noncontiguous(dev, vaddr) \ vunmap(vaddr); int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); #endif /* _LINUX_IOMMU_DMA_H */ |
| 1102 1099 1100 1099 50 379 2 2 2 373 373 372 373 373 371 373 1 2 2 2 2 1 2 80 78 2 2 2 2 2 80 4625 2738 4621 3434 6897 6900 19 3298 4 378 375 5 378 378 378 378 378 372 371 371 370 372 378 377 375 378 376 377 377 51 51 51 50 50 50 50 78 51 51 50 50 78 59 59 48 59 1 6900 6897 6900 1 6894 6910 6899 6908 6897 6842 2 6837 6837 6849 6836 6903 68 6852 3308 3298 3301 3280 3301 1789 6356 6364 6358 6359 6354 6357 6357 6365 1815 1813 1806 1795 1790 1794 42 42 721 2 3 3 1 3 720 723 706 705 708 658 196 721 725 724 723 722 20 20 20 706 3 723 724 20 727 1069 483 484 483 481 481 481 3 3 3 3 34858 34869 34632 34591 34589 34602 4 35050 35060 35061 2735 2682 60 110 108 109 5 5 5 5 5 5 33473 422 29926 7369 33178 32989 33253 656 4603 1300 20 10 4655 4595 4611 1286 4621 1229 6 6 6 34 34 34 78 77 77 77 77 77 76 77 1 7 7 5 5 7 26 26 26 26 24 13 22 7 2 7 5 5 76 74 73 74 74 73 76 7 60 28 14 8 14 14 14 16 20 16 15 1 9 8 2 7 7 6 6 6 6 6 6 4 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } /* * Check if the allocation size would exceed INT_MAX. kvmalloc_array() * and kvmalloc() will warn if the allocation size is greater than * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. * * This can happen when sysctl_nr_open is set to a very high value and * a process tries to use a file descriptor near that limit. For example, * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what * systemd typically sets it to - then trying to use a file descriptor * close to that value will require allocating a file descriptor table * that exceeds 8GB in size. */ if (unlikely(nr > INT_MAX / sizeof(struct file *))) return ERR_PTR(-EMFILE); fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array while it is being resized. * * We need to make sure our update to the array does not get lost as the resizing * thread can be copying the content as we modify it. * * We have two ways to do it: * - go off CPU waiting for resize_in_progress to clear * - take the spin lock * * The latter is trivial to implement and saves us from having to might_sleep() * for debugging purposes. * * This is moved out of line from fd_install() to convince gcc to optimize that * routine better. */ static void noinline fd_install_slowpath(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); } /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; /* * Note that we are not guaranteed to be called after fdget_pos() on * this file obj, in which case the caller is expected to provide the * appropriate locking. */ return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; err = do_dup2(files, file, fd, flags); if (err < 0) return err; return 0; out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; FD_PREPARE(fdf, o_flags, file); if (fdf.err) return fdf.err; get_file(file); if (ufd) { error = put_user(fd_prepare_fd(fdf), ufd); if (error) return error; } __receive_sock(fd_prepare_file(fdf)); return fd_publish(fdf); } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd); |
| 2 29 5 102 102 49 232 490 491 234 5 6 2 1 9 1 1 7 54 18 15 23 16 15 3 3 2 1 5 2 2 2 2 26 26 19 12 9 5 17 6 5 11 23 8 6 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <net/inet_sock.h> #include <net/dsfield.h> #include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3, INET_ECN_MASK = 3, }; extern int sysctl_tunnel_ecn_log; static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_CE; } static inline int INET_ECN_is_not_ect(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT; } static inline int INET_ECN_is_capable(__u8 dsfield) { return dsfield & INET_ECN_ECT_0; } /* * RFC 3168 9.1.1 * The full-functionality option for ECN encapsulation is to copy the * ECN codepoint of the inside header to the outside header on * encapsulation if the inside header is not-ECT or ECT, and to set the * ECN codepoint of the outside header to ECT(0) if the ECN codepoint of * the inside header is CE. */ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~INET_ECN_MASK; outer |= !INET_ECN_is_ce(inner) ? (inner & INET_ECN_MASK) : INET_ECN_ECT_0; return outer; } static inline void INET_ECN_xmit(struct sock *sk) { inet_sk(sk)->tos |= INET_ECN_ECT_0; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 ecn = (iph->tos + 1) & INET_ECN_MASK; __be16 check_add; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check_add = (__force __be16)((__force u16)htons(0xFFFB) + (__force u16)htons(ecn)); iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. */ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int skb_get_dsfield(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) break; return ipv4_get_dsfield(ip_hdr(skb)); case cpu_to_be16(ETH_P_IPV6): if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) break; return ipv6_get_dsfield(ipv6_hdr(skb)); } return -1; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif |
| 6520 3678 3678 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 5 5 4 5 5 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | /* SPDX-License-Identifier: GPL-2.0 */ #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include "amigaffs.h" #include <linux/mutex.h> #include <linux/workqueue.h> /* Ugly macros make the code more pretty. */ #define AFFS_BLOCK(sb, bh, blk) (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)]) #define AFFS_HEAD(bh) ((struct affs_head *)(bh)->b_data) #define AFFS_TAIL(sb, bh) ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail))) #define AFFS_ROOT_HEAD(bh) ((struct affs_root_head *)(bh)->b_data) #define AFFS_ROOT_TAIL(sb, bh) ((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail))) #define AFFS_DATA_HEAD(bh) ((struct affs_data_head *)(bh)->b_data) #define AFFS_DATA(bh) (((struct affs_data_head *)(bh)->b_data)->data) #define AFFS_CACHE_SIZE PAGE_SIZE #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) #define AFFSNAMEMAX 30U struct affs_ext_key { u32 ext; /* idx of the extended block */ u32 key; /* block number */ }; /* * affs fs inode data in memory */ struct affs_inode_info { atomic_t i_opencnt; struct mutex i_link_lock; /* Protects internal inode access. */ struct mutex i_ext_lock; /* Protects internal inode access. */ #define i_hash_lock i_ext_lock u32 i_blkcnt; /* block count */ u32 i_extcnt; /* extended block count */ u32 *i_lc; /* linear cache of extended blocks */ u32 i_lc_size; u32 i_lc_shift; u32 i_lc_mask; struct affs_ext_key *i_ac; /* associative cache of extended blocks */ u32 i_ext_last; /* last accessed extended block */ struct buffer_head *i_ext_bh; /* bh of last extended block */ loff_t mmu_private; u32 i_protect; /* unused attribute bits */ u32 i_lastalloc; /* last allocated block */ int i_pa_cnt; /* number of preallocated blocks */ struct inode vfs_inode; }; /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { return container_of(inode, struct affs_inode_info, vfs_inode); } /* * super-block data in memory * * Block numbers are adjusted for their actual size * */ struct affs_bm_info { u32 bm_key; /* Disk block number */ u32 bm_free; /* Free blocks in here */ }; struct affs_sb_info { int s_partition_size; /* Partition size in blocks. */ int s_reserved; /* Number of reserved blocks. */ //u32 s_blksize; /* Initial device blksize */ u32 s_data_blksize; /* size of the data block w/o header */ u32 s_root_block; /* FFS root block number. */ int s_hashsize; /* Size of hash table. */ unsigned long s_flags; /* See below. */ kuid_t s_uid; /* uid to override */ kgid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ struct mutex s_bmlock; /* Protects bitmap access. */ struct affs_bm_info *s_bitmap; /* Bitmap infos. */ u32 s_bmap_count; /* # of bitmap blocks. */ u32 s_bmap_bits; /* # of bits in one bitmap blocks */ u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ spinlock_t symlink_lock; /* protects the previous two */ struct super_block *sb; /* the VFS superblock object */ int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ #define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. */ #define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ #define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */ #define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */ #define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */ #define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ #define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ #define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */ #define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */ #define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ #define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt) #define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt) #define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt) /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) { return sb->s_fs_info; } void affs_mark_sb_dirty(struct super_block *sb); /* amigaffs.c */ extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void affs_secs_to_datestamp(time64_t secs, struct affs_date *ds); extern umode_t affs_prot_to_mode(u32 prot); extern void affs_mode_to_prot(struct inode *inode); __printf(3, 4) extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); __printf(3, 4) extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); extern bool affs_nofilenametruncate(const struct dentry *dentry); extern int affs_check_name(const unsigned char *name, int len, bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ extern u32 affs_count_free_blocks(struct super_block *s); extern void affs_free_block(struct super_block *sb, u32 block); extern u32 affs_alloc_block(struct inode *inode, u32 goal); extern int affs_init_bitmap(struct super_block *sb, int *flags); extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); extern int affs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); int affs_file_fsync(struct file *, loff_t, loff_t, int); /* dir.c */ extern void affs_dir_truncate(struct inode *); /* jump tables */ extern const struct inode_operations affs_file_inode_operations; extern const struct inode_operations affs_dir_inode_operations; extern const struct inode_operations affs_symlink_inode_operations; extern const struct file_operations affs_file_operations; extern const struct file_operations affs_file_operations_ofs; extern const struct file_operations affs_dir_operations; extern const struct address_space_operations affs_symlink_aops; extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; extern const struct dentry_operations affs_dentry_operations; extern const struct dentry_operations affs_intl_dentry_operations; static inline bool affs_validblock(struct super_block *sb, int block) { return(block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size); } static inline void affs_set_blocksize(struct super_block *sb, int size) { sb_set_blocksize(sb, size); } static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_bread(sb, block); return NULL; } static inline struct buffer_head * affs_getblk(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_getblk(sb, block); return NULL; } static inline struct buffer_head * affs_getzeroblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); lock_buffer(bh); memset(bh->b_data, 0 , sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); return bh; } return NULL; } static inline struct buffer_head * affs_getemptyblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); wait_on_buffer(bh); set_buffer_uptodate(bh); return bh; } return NULL; } static inline void affs_brelse(struct buffer_head *bh) { if (bh) pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); brelse(bh); } static inline void affs_adjust_checksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); } static inline void affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); } static inline void affs_lock_link(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_link_lock); } static inline void affs_unlock_link(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_link_lock); } static inline void affs_lock_dir(struct inode *inode) { mutex_lock_nested(&AFFS_I(inode)->i_hash_lock, SINGLE_DEPTH_NESTING); } static inline void affs_unlock_dir(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_hash_lock); } static inline void affs_lock_ext(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_ext_lock); } static inline void affs_unlock_ext(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_ext_lock); } |
| 5 6 6 6 6 6 5 6 6 6 6 6 6 6 6 6 6 6 5 5 5 1 1 1 1 1 6 6 6 6 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-map-ops.h> #include <linux/scatterlist.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> #include <linux/pci-p2pdma.h> #include "direct.h" /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } static inline struct page *dma_direct_to_page(struct device *dev, dma_addr_t dma_addr) { return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); } u64 dma_direct_get_required_mask(struct device *dev) { phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) { u64 dma_limit = min_not_zero( dev->coherent_dma_mask, dev->bus_dma_limit); /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable * we will fallback to the next lower zone and try again. * * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= zone_dma_limit) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); if (dma_addr == DMA_MAPPING_ERROR) return false; return dma_addr + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) { if (!force_dma_unencrypted(dev)) return 0; return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size)); } static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) { int ret; if (!force_dma_unencrypted(dev)) return 0; ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)); if (ret) pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); return ret; } static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { if (swiotlb_free(dev, page, size)) return; dma_free_contiguous(dev, page, size); } static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) { struct page *page = swiotlb_alloc(dev, size); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { swiotlb_free(dev, page, size); return NULL; } return page; } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); struct page *page; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); if (is_swiotlb_for_alloc(dev)) return dma_direct_alloc_swiotlb(dev, size); gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { if (dma_coherent_ok(dev, page_to_phys(page), size) && (allow_highmem || !PageHighMem(page))) return page; dma_free_contiguous(dev, page, size); } while ((page = alloc_pages_node(node, gfp, get_order(size))) && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) gfp |= GFP_DMA32; else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) gfp = (gfp & ~GFP_DMA32) | GFP_DMA; else return NULL; } return page; } /* * Check if a potentially blocking operations needs to dip into the atomic * pools for the given device/gfp. */ static bool dma_direct_use_pool(struct device *dev, gfp_t gfp) { return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev); } static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; u64 phys_limit; void *ret; if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) return NULL; gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); if (!page) return NULL; *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; } static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); /* return the page pointer as the opaque cookie */ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { bool remap = false, set_uncached = false; struct page *page; void *ret; size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); if (!dev_is_dma_coherent(dev)) { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); /* * If there is a global pool, always allocate from it for * non-coherent devices. */ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) return dma_alloc_from_global_coherent(dev, size, dma_handle); /* * Otherwise we require the architecture to either be able to * mark arbitrary parts of the kernel direct mapping uncached, * or remapped it uncached. */ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED); remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); if (!set_uncached && !remap) { pr_warn_once("coherent DMA allocations not supported on this platform.\n"); return NULL; } } /* * Remapping or decrypting memory may block, allocate the memory from * the atomic pools instead if we aren't allowed block. */ if ((remap || force_dma_unencrypted(dev)) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; /* * dma_alloc_contiguous can return highmem pages depending on a * combination the cma= arguments and per-arch setup. These need to be * remapped to return a kernel virtual address. */ if (PageHighMem(page)) { remap = true; set_uncached = false; } if (remap) { pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, prot, __builtin_return_address(0)); if (!ret) goto out_free_pages; } else { ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) goto out_leak_pages; } memset(ret, 0, size); if (set_uncached) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: if (dma_set_encrypted(dev, page_address(page), size)) return NULL; out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; out_leak_pages: return NULL; } void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ dma_free_contiguous(dev, cpu_addr, size); return; } if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !dev_is_dma_coherent(dev) && !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) { if (!dma_release_from_global_coherent(page_order, cpu_addr)) WARN_ON_ONCE(1); return; } /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); if (dma_set_encrypted(dev, cpu_addr, size)) return; } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page; void *ret; if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp, false); if (!page) return NULL; ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) goto out_leak_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_leak_pages: return NULL; } void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir) { void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, vaddr, size)) return; if (dma_set_encrypted(dev, vaddr, size)) return; __dma_direct_free_pages(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); swiotlb_sync_single_for_device(dev, paddr, sg->length, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, dir); } } #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(paddr, sg->length, dir); swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } /* * Unmaps segments, except for ones marked as pci_p2pdma which do not * require any further action as they contain a bus address. */ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } #endif int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* * Any P2P mapping that traverses the PCI host bridge * must be mapped with CPU physical address and not PCI * bus addresses. */ break; case PCI_P2PDMA_MAP_NONE: sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; } break; case PCI_P2PDMA_MAP_BUS_ADDR: sg->dma_address = pci_p2pdma_bus_addr_map( p2pdma_state.mem, sg_phys(sg)); sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; default: ret = -EREMOTEIO; goto out_unmap; } sg_dma_len(sg) = sg->length; } return nents; out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return ret; } int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { struct page *page = dma_direct_to_page(dev, dma_addr); int ret; ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); } int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (force_dma_unencrypted(dev)) vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret)) return ret; if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) return -ENXIO; return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } int dma_direct_supported(struct device *dev, u64 mask) { u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; /* * Because 32-bit DMA masks are so common we expect every architecture * to be able to satisfy them - either by not supporting more physical * memory, or by providing a ZONE_DMA32. If neither is the case, the * architecture needs to use an IOMMU instead of the direct mapping. */ if (mask >= DMA_BIT_MASK(32)) return 1; /* * This check needs to be against the actual bit mask value, so use * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = min_t(u64, min_mask, zone_dma_limit); return mask >= phys_to_dma_unencrypted(dev, min_mask); } static const struct bus_dma_region *dma_find_range(struct device *dev, unsigned long start_pfn) { const struct bus_dma_region *m; for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); if (start_pfn >= cpu_start_pfn && start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) return m; } return NULL; } /* * To check whether all ram resource ranges are covered by dma range map * Returns 0 when further check is needed * Returns 1 if there is some RAM range can't be covered by dma_range_map */ static int check_ram_in_range_map(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long end_pfn = start_pfn + nr_pages; struct device *dev = data; while (start_pfn < end_pfn) { const struct bus_dma_region *bdr; bdr = dma_find_range(dev, start_pfn); if (!bdr) return 1; start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size); } return 0; } bool dma_direct_all_ram_mapped(struct device *dev) { if (!dev->dma_range_map) return true; return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev, check_ram_in_range_map); } size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ if (is_swiotlb_active(dev) && (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev))) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) { return !dev_is_dma_coherent(dev) || swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)); } /** * dma_direct_set_offset - Assign scalar offset for a single DMA range. * @dev: device pointer; needed to "own" the alloced memory. * @cpu_start: beginning of memory region covered by this offset. * @dma_start: beginning of DMA/PCI region covered by this offset. * @size: size of the region. * * This is for the simple case of a uniform offset which cannot * be discovered by "dma-ranges". * * It returns -ENOMEM if out of memory, -EINVAL if a map * already exists, 0 otherwise. * * Note: any call to this from a driver is a bug. The mapping needs * to be described by the device tree or other firmware interfaces. */ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dma_addr_t dma_start, u64 size) { struct bus_dma_region *map; u64 offset = (u64)cpu_start - (u64)dma_start; if (dev->dma_range_map) { dev_err(dev, "attempt to add DMA range to existing map\n"); return -EINVAL; } if (!offset) return 0; map = kcalloc(2, sizeof(*map), GFP_KERNEL); if (!map) return -ENOMEM; map[0].cpu_start = cpu_start; map[0].dma_start = dma_start; map[0].size = size; dev->dma_range_map = map; return 0; } |
| 185 181 181 181 180 181 185 185 185 184 182 8 5 5 5 5 5 5 17 16 16 16 16 16 8 2 6 2 6 5 5 6 2 16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 | // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" struct features_req_info { struct ethnl_req_info base; }; struct features_reply_data { struct ethnl_reply_data base; u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; u32 active[ETHTOOL_DEV_FEATURE_WORDS]; u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; u32 all[ETHTOOL_DEV_FEATURE_WORDS]; }; #define FEATURES_REPDATA(__reply_base) \ container_of(__reply_base, struct features_reply_data, base) const struct nla_policy ethnl_features_get_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src) { unsigned int i; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++) dest[i] = src >> (32 * i); } static int features_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct features_reply_data *data = FEATURES_REPDATA(reply_base); struct net_device *dev = reply_base->dev; netdev_features_t all_features; ethnl_features_to_bitmap32(data->hw, dev->hw_features); ethnl_features_to_bitmap32(data->wanted, dev->wanted_features); ethnl_features_to_bitmap32(data->active, dev->features); ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE); all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0); ethnl_features_to_bitmap32(data->all, all_features); return 0; } static int features_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; unsigned int len = 0; int ret; ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; len += ret; return len; } static int features_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct features_reply_data *data = FEATURES_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw, data->all, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) return ret; return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE, data->nochange, NULL, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); } const struct ethnl_request_ops ethnl_features_request_ops = { .request_cmd = ETHTOOL_MSG_FEATURES_GET, .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY, .hdr_attr = ETHTOOL_A_FEATURES_HEADER, .req_info_size = sizeof(struct features_req_info), .reply_data_size = sizeof(struct features_reply_data), .prepare_data = features_prepare_data, .reply_size = features_reply_size, .fill_reply = features_fill_reply, }; /* FEATURES_SET */ const struct nla_policy ethnl_features_set_policy[] = { [ETHTOOL_A_FEATURES_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, }; static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val) { const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); unsigned int i; for (i = 0; i < words; i++) dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG)); } static netdev_features_t ethnl_bitmap_to_features(unsigned long *src) { const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE; const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT); netdev_features_t ret = 0; unsigned int i; for (i = 0; i < words; i++) ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG); ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT); return ret; } static int features_send_reply(struct net_device *dev, struct genl_info *info, const unsigned long *wanted, const unsigned long *wanted_mask, const unsigned long *active, const unsigned long *active_mask, bool compact) { struct sk_buff *rskb; void *reply_payload; int reply_len = 0; int ret; reply_len = ethnl_reply_header_size(); ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto err; reply_len += ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY, ETHTOOL_A_FEATURES_HEADER, info, &reply_payload); if (!rskb) goto err; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted, wanted_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active, active_mask, NETDEV_FEATURE_COUNT, netdev_features_strings, compact); if (ret < 0) goto nla_put_failure; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); return ret; nla_put_failure: nlmsg_free(rskb); WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", reply_len); err: GENL_SET_ERR_MSG(info, "failed to send reply message"); return ret; } int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) { DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT); DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT); struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; bool mod; int ret; if (!tb[ETHTOOL_A_FEATURES_WANTED]) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEATURES_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, tb[ETHTOOL_A_FEATURES_WANTED], netdev_features_strings, info->extack); if (ret < 0) goto out_ops; if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) { GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features"); ret = -EINVAL; goto out_ops; } /* set req_wanted bits not in req_mask from old_wanted */ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT); bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT); if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) { dev->wanted_features &= ~dev->hw_features; dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features; __netdev_update_features(dev); } ethnl_features_to_bitmap(new_active, dev->features); mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); ret = 0; if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) { bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS; bitmap_xor(wanted_diff_mask, req_wanted, new_active, NETDEV_FEATURE_COUNT); bitmap_xor(active_diff_mask, old_active, new_active, NETDEV_FEATURE_COUNT); bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask, NETDEV_FEATURE_COUNT); bitmap_and(req_wanted, req_wanted, wanted_diff_mask, NETDEV_FEATURE_COUNT); bitmap_and(new_active, new_active, active_diff_mask, NETDEV_FEATURE_COUNT); ret = features_send_reply(dev, info, req_wanted, wanted_diff_mask, new_active, active_diff_mask, compact); } if (mod) netdev_features_change(dev); out_ops: ethnl_ops_complete(dev); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } |
| 243 187 188 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IPv6 * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. */ /* * Fixes: * * Ralf Baechle : generic ipv6 checksum * <ralf@waldorf-gmbh.de> */ #ifndef _CHECKSUM_IPV6_H #define _CHECKSUM_IPV6_H #include <asm/types.h> #include <asm/byteorder.h> #include <net/ip.h> #include <asm/checksum.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/ipv6.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum); #endif static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) { return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, proto, 0)); } static __inline__ __sum16 tcp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline void __tcp_v6_send_check(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); ipv6h->payload_len = 0; th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0); } static inline __sum16 udp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len); int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif |
| 2 1 1 1 1 1 1 1 3 2 4 3 3 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 | // SPDX-License-Identifier: GPL-2.0-only /* * keyspan_remote: USB driver for the Keyspan DMR * * Copyright (C) 2005 Zymeta Corporation - Michael Downey (downey@zymeta.com) * * This driver has been put together with the support of Innosys, Inc. * and Keyspan, Inc the manufacturers of the Keyspan USB DMR product. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb/input.h> /* Parameters that can be passed to the driver. */ static int debug; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Enable extra debug messages and information"); /* Vendor and product ids */ #define USB_KEYSPAN_VENDOR_ID 0x06CD #define USB_KEYSPAN_PRODUCT_UIA11 0x0202 /* Defines for converting the data from the remote. */ #define ZERO 0x18 #define ZERO_MASK 0x1F /* 5 bits for a 0 */ #define ONE 0x3C #define ONE_MASK 0x3F /* 6 bits for a 1 */ #define SYNC 0x3F80 #define SYNC_MASK 0x3FFF /* 14 bits for a SYNC sequence */ #define STOP 0x00 #define STOP_MASK 0x1F /* 5 bits for the STOP sequence */ #define GAP 0xFF #define RECV_SIZE 8 /* The UIA-11 type have a 8 byte limit. */ /* * Table that maps the 31 possible keycodes to input keys. * Currently there are 15 and 17 button models so RESERVED codes * are blank areas in the mapping. */ static const unsigned short keyspan_key_table[] = { KEY_RESERVED, /* 0 is just a place holder. */ KEY_RESERVED, KEY_STOP, KEY_PLAYCD, KEY_RESERVED, KEY_PREVIOUSSONG, KEY_REWIND, KEY_FORWARD, KEY_NEXTSONG, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_PAUSE, KEY_VOLUMEUP, KEY_RESERVED, KEY_RESERVED, KEY_RESERVED, KEY_VOLUMEDOWN, KEY_RESERVED, KEY_UP, KEY_RESERVED, KEY_MUTE, KEY_LEFT, KEY_ENTER, KEY_RIGHT, KEY_RESERVED, KEY_RESERVED, KEY_DOWN, KEY_RESERVED, KEY_KPASTERISK, KEY_RESERVED, KEY_MENU }; /* table of devices that work with this driver */ static const struct usb_device_id keyspan_table[] = { { USB_DEVICE(USB_KEYSPAN_VENDOR_ID, USB_KEYSPAN_PRODUCT_UIA11) }, { } /* Terminating entry */ }; /* Structure to store all the real stuff that a remote sends to us. */ struct keyspan_message { u16 system; u8 button; u8 toggle; }; /* Structure used for all the bit testing magic needed to be done. */ struct bit_tester { u32 tester; int len; int pos; int bits_left; u8 buffer[32]; }; /* Structure to hold all of our driver specific stuff */ struct usb_keyspan { char name[128]; char phys[64]; unsigned short keymap[ARRAY_SIZE(keyspan_key_table)]; struct usb_device *udev; struct input_dev *input; struct usb_interface *interface; struct usb_endpoint_descriptor *in_endpoint; struct urb* irq_urb; int open; dma_addr_t in_dma; unsigned char *in_buffer; /* variables used to parse messages from remote. */ struct bit_tester data; int stage; int toggle; }; static struct usb_driver keyspan_driver; /* * Debug routine that prints out what we've received from the remote. */ static void keyspan_print(struct usb_keyspan* dev) /*unsigned char* data)*/ { char codes[4 * RECV_SIZE]; int i; for (i = 0; i < RECV_SIZE; i++) snprintf(codes + i * 3, 4, "%02x ", dev->in_buffer[i]); dev_info(&dev->udev->dev, "%s\n", codes); } /* * Routine that manages the bit_tester structure. It makes sure that there are * at least bits_needed bits loaded into the tester. */ static int keyspan_load_tester(struct usb_keyspan* dev, int bits_needed) { if (dev->data.bits_left >= bits_needed) return 0; /* * Somehow we've missed the last message. The message will be repeated * though so it's not too big a deal */ if (dev->data.pos >= dev->data.len) { dev_dbg(&dev->interface->dev, "%s - Error ran out of data. pos: %d, len: %d\n", __func__, dev->data.pos, dev->data.len); return -1; } /* Load as much as we can into the tester. */ while ((dev->data.bits_left + 7 < (sizeof(dev->data.tester) * 8)) && (dev->data.pos < dev->data.len)) { dev->data.tester += (dev->data.buffer[dev->data.pos++] << dev->data.bits_left); dev->data.bits_left += 8; } return 0; } static void keyspan_report_button(struct usb_keyspan *remote, int button, int press) { struct input_dev *input = remote->input; input_event(input, EV_MSC, MSC_SCAN, button); input_report_key(input, remote->keymap[button], press); input_sync(input); } /* * Routine that handles all the logic needed to parse out the message from the remote. */ static void keyspan_check_data(struct usb_keyspan *remote) { int i; int found = 0; struct keyspan_message message; switch(remote->stage) { case 0: /* * In stage 0 we want to find the start of a message. The remote sends a 0xFF as filler. * So the first byte that isn't a FF should be the start of a new message. */ for (i = 0; i < RECV_SIZE && remote->in_buffer[i] == GAP; ++i); if (i < RECV_SIZE) { memcpy(remote->data.buffer, remote->in_buffer, RECV_SIZE); remote->data.len = RECV_SIZE; remote->data.pos = 0; remote->data.tester = 0; remote->data.bits_left = 0; remote->stage = 1; } break; case 1: /* * Stage 1 we should have 16 bytes and should be able to detect a * SYNC. The SYNC is 14 bits, 7 0's and then 7 1's. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; found = 0; while ((remote->data.bits_left >= 14 || remote->data.pos < remote->data.len) && !found) { for (i = 0; i < 8; ++i) { if (keyspan_load_tester(remote, 14) != 0) { remote->stage = 0; return; } if ((remote->data.tester & SYNC_MASK) == SYNC) { remote->data.tester = remote->data.tester >> 14; remote->data.bits_left -= 14; found = 1; break; } else { remote->data.tester = remote->data.tester >> 1; --remote->data.bits_left; } } } if (!found) { remote->stage = 0; remote->data.len = 0; } else { remote->stage = 2; } break; case 2: /* * Stage 2 we should have 24 bytes which will be enough for a full * message. We need to parse out the system code, button code, * toggle code, and stop. */ memcpy(remote->data.buffer + remote->data.len, remote->in_buffer, RECV_SIZE); remote->data.len += RECV_SIZE; message.system = 0; for (i = 0; i < 9; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.system = message.system << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.system = (message.system << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in system data.\n", __func__); remote->stage = 0; return; } } message.button = 0; for (i = 0; i < 5; i++) { keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.button = message.button << 1; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.button = (message.button << 1) + 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Unknown sequence found in button data.\n", __func__); remote->stage = 0; return; } } keyspan_load_tester(remote, 6); if ((remote->data.tester & ZERO_MASK) == ZERO) { message.toggle = 0; remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else if ((remote->data.tester & ONE_MASK) == ONE) { message.toggle = 1; remote->data.tester = remote->data.tester >> 6; remote->data.bits_left -= 6; } else { dev_err(&remote->interface->dev, "%s - Error in message, invalid toggle.\n", __func__); remote->stage = 0; return; } keyspan_load_tester(remote, 5); if ((remote->data.tester & STOP_MASK) == STOP) { remote->data.tester = remote->data.tester >> 5; remote->data.bits_left -= 5; } else { dev_err(&remote->interface->dev, "Bad message received, no stop bit found.\n"); } dev_dbg(&remote->interface->dev, "%s found valid message: system: %d, button: %d, toggle: %d\n", __func__, message.system, message.button, message.toggle); if (message.toggle != remote->toggle) { keyspan_report_button(remote, message.button, 1); keyspan_report_button(remote, message.button, 0); remote->toggle = message.toggle; } remote->stage = 0; break; } } /* * Routine for sending all the initialization messages to the remote. */ static int keyspan_setup(struct usb_device* dev) { int retval = 0; retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x11, 0x40, 0x5601, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set bit rate due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x44, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to set resume sensitivity due to error: %d\n", __func__, retval); return(retval); } retval = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0x22, 0x40, 0x0, 0x0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { dev_dbg(&dev->dev, "%s - failed to turn receive on due to error: %d\n", __func__, retval); return(retval); } dev_dbg(&dev->dev, "%s - Setup complete.\n", __func__); return(retval); } /* * Routine used to handle a new message that has come in. */ static void keyspan_irq_recv(struct urb *urb) { struct usb_keyspan *dev = urb->context; int retval; /* Check our status in case we need to bail out early. */ switch (urb->status) { case 0: break; /* Device went away so don't keep trying to read from it. */ case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; default: goto resubmit; } if (debug) keyspan_print(dev); keyspan_check_data(dev); resubmit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&dev->interface->dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } static int keyspan_open(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); remote->irq_urb->dev = remote->udev; if (usb_submit_urb(remote->irq_urb, GFP_KERNEL)) return -EIO; return 0; } static void keyspan_close(struct input_dev *dev) { struct usb_keyspan *remote = input_get_drvdata(dev); usb_kill_urb(remote->irq_urb); } static struct usb_endpoint_descriptor *keyspan_get_in_endpoint(struct usb_host_interface *iface) { struct usb_endpoint_descriptor *endpoint; int i; for (i = 0; i < iface->desc.bNumEndpoints; ++i) { endpoint = &iface->endpoint[i].desc; if (usb_endpoint_is_int_in(endpoint)) { /* we found our interrupt in endpoint */ return endpoint; } } return NULL; } /* * Routine that sets up the driver to handle a specific USB device detected on the bus. */ static int keyspan_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_endpoint_descriptor *endpoint; struct usb_keyspan *remote; struct input_dev *input_dev; int i, error; endpoint = keyspan_get_in_endpoint(interface->cur_altsetting); if (!endpoint) return -ENODEV; remote = kzalloc(sizeof(*remote), GFP_KERNEL); input_dev = input_allocate_device(); if (!remote || !input_dev) { error = -ENOMEM; goto fail1; } remote->udev = udev; remote->input = input_dev; remote->interface = interface; remote->in_endpoint = endpoint; remote->toggle = -1; /* Set to -1 so we will always not match the toggle from the first remote message. */ remote->in_buffer = usb_alloc_coherent(udev, RECV_SIZE, GFP_KERNEL, &remote->in_dma); if (!remote->in_buffer) { error = -ENOMEM; goto fail1; } remote->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if (!remote->irq_urb) { error = -ENOMEM; goto fail2; } error = keyspan_setup(udev); if (error) { error = -ENODEV; goto fail3; } if (udev->manufacturer) strscpy(remote->name, udev->manufacturer, sizeof(remote->name)); if (udev->product) { if (udev->manufacturer) strlcat(remote->name, " ", sizeof(remote->name)); strlcat(remote->name, udev->product, sizeof(remote->name)); } if (!strlen(remote->name)) snprintf(remote->name, sizeof(remote->name), "USB Keyspan Remote %04x:%04x", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); usb_make_path(udev, remote->phys, sizeof(remote->phys)); strlcat(remote->phys, "/input0", sizeof(remote->phys)); memcpy(remote->keymap, keyspan_key_table, sizeof(remote->keymap)); input_dev->name = remote->name; input_dev->phys = remote->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &interface->dev; input_dev->keycode = remote->keymap; input_dev->keycodesize = sizeof(unsigned short); input_dev->keycodemax = ARRAY_SIZE(remote->keymap); input_set_capability(input_dev, EV_MSC, MSC_SCAN); __set_bit(EV_KEY, input_dev->evbit); for (i = 0; i < ARRAY_SIZE(keyspan_key_table); i++) __set_bit(keyspan_key_table[i], input_dev->keybit); __clear_bit(KEY_RESERVED, input_dev->keybit); input_set_drvdata(input_dev, remote); input_dev->open = keyspan_open; input_dev->close = keyspan_close; /* * Initialize the URB to access the device. * The urb gets sent to the device in keyspan_open() */ usb_fill_int_urb(remote->irq_urb, remote->udev, usb_rcvintpipe(remote->udev, endpoint->bEndpointAddress), remote->in_buffer, RECV_SIZE, keyspan_irq_recv, remote, endpoint->bInterval); remote->irq_urb->transfer_dma = remote->in_dma; remote->irq_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* we can register the device now, as it is ready */ error = input_register_device(remote->input); if (error) goto fail3; /* save our data pointer in this interface device */ usb_set_intfdata(interface, remote); return 0; fail3: usb_free_urb(remote->irq_urb); fail2: usb_free_coherent(udev, RECV_SIZE, remote->in_buffer, remote->in_dma); fail1: kfree(remote); input_free_device(input_dev); return error; } /* * Routine called when a device is disconnected from the USB. */ static void keyspan_disconnect(struct usb_interface *interface) { struct usb_keyspan *remote; remote = usb_get_intfdata(interface); usb_set_intfdata(interface, NULL); if (remote) { /* We have a valid driver structure so clean up everything we allocated. */ input_unregister_device(remote->input); usb_kill_urb(remote->irq_urb); usb_free_urb(remote->irq_urb); usb_free_coherent(remote->udev, RECV_SIZE, remote->in_buffer, remote->in_dma); kfree(remote); } } /* * Standard driver set up sections */ static struct usb_driver keyspan_driver = { .name = "keyspan_remote", .probe = keyspan_probe, .disconnect = keyspan_disconnect, .id_table = keyspan_table }; module_usb_driver(keyspan_driver); MODULE_DEVICE_TABLE(usb, keyspan_table); MODULE_AUTHOR("Michael Downey <downey@zymeta.com>"); MODULE_DESCRIPTION("Driver for the USB Keyspan remote control."); MODULE_LICENSE("GPL"); |
| 269 186 9287 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */ /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #ifndef _LINUX_SIPHASH_H #define _LINUX_SIPHASH_H #include <linux/types.h> #include <linux/kernel.h> #define SIPHASH_ALIGNMENT __alignof__(u64) typedef struct { u64 key[2]; } siphash_key_t; #define siphash_aligned_key_t siphash_key_t __aligned(16) static inline bool siphash_key_is_zero(const siphash_key_t *key) { return !(key->key[0] | key->key[1]); } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); u64 siphash_3u64(const u64 a, const u64 b, const u64 c, const siphash_key_t *key); u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, const siphash_key_t *key); u64 siphash_1u32(const u32 a, const siphash_key_t *key); u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t *key); static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t *key) { return siphash_1u64((u64)b << 32 | a, key); } static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const siphash_key_t *key) { return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); } static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return siphash_1u32(le32_to_cpup((const __le32 *)data), key); if (__builtin_constant_p(len) && len == 8) return siphash_1u64(le64_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 16) return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 24) return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 32) return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), le64_to_cpu(data[2]), le64_to_cpu(data[3]), key); return __siphash_aligned(data, len, key); } /** * siphash - compute 64-bit siphash PRF value * @data: buffer to hash * @size: size of @data * @key: the siphash key */ static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); return ___siphash_aligned(data, len, key); } #define HSIPHASH_ALIGNMENT __alignof__(unsigned long) typedef struct { unsigned long key[2]; } hsiphash_key_t; u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, const hsiphash_key_t *key); u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, const hsiphash_key_t *key); static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t *key) { if (__builtin_constant_p(len) && len == 4) return hsiphash_1u32(le32_to_cpu(data[0]), key); if (__builtin_constant_p(len) && len == 8) return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), key); if (__builtin_constant_p(len) && len == 12) return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), key); if (__builtin_constant_p(len) && len == 16) return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), le32_to_cpu(data[2]), le32_to_cpu(data[3]), key); return __hsiphash_aligned(data, len, key); } /** * hsiphash - compute 32-bit hsiphash PRF value * @data: buffer to hash * @size: size of @data * @key: the hsiphash key */ static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); return ___hsiphash_aligned(data, len, key); } /* * These macros expose the raw SipHash and HalfSipHash permutations. * Do not use them directly! If you think you have a use for them, * be sure to CC the maintainer of this file explaining why. */ #define SIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol64((b), 13), (b) ^= (a), (a) = rol64((a), 32), \ (c) += (d), (d) = rol64((d), 16), (d) ^= (c), \ (a) += (d), (d) = rol64((d), 21), (d) ^= (a), \ (c) += (b), (b) = rol64((b), 17), (b) ^= (c), (c) = rol64((c), 32)) #define SIPHASH_CONST_0 0x736f6d6570736575ULL #define SIPHASH_CONST_1 0x646f72616e646f6dULL #define SIPHASH_CONST_2 0x6c7967656e657261ULL #define SIPHASH_CONST_3 0x7465646279746573ULL #define HSIPHASH_PERMUTATION(a, b, c, d) ( \ (a) += (b), (b) = rol32((b), 5), (b) ^= (a), (a) = rol32((a), 16), \ (c) += (d), (d) = rol32((d), 8), (d) ^= (c), \ (a) += (d), (d) = rol32((d), 7), (d) ^= (a), \ (c) += (b), (b) = rol32((b), 13), (b) ^= (c), (c) = rol32((c), 16)) #define HSIPHASH_CONST_0 0U #define HSIPHASH_CONST_1 0U #define HSIPHASH_CONST_2 0x6c796765U #define HSIPHASH_CONST_3 0x74656462U #endif /* _LINUX_SIPHASH_H */ |
| 13 13 13 13 15 6 13 13 13 13 13 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. * * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache revoke status of a buffer in the current transaction in b_states * bits. As the name says, revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from the kjournald and no other thread touches this table. Also * journal_switch_revoke_table() which switches which hashtable belongs to the * running and which to the committing transaction is called only from * kjournald. Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. */ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. * * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. */ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. */ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } } /* * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block_nonatomic(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* jbd2_journal_switch_revoke_table table select j_revoke for next * transaction we do not want to suspend any processing until all * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem. */ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } } |
| 3 5 5 1 4 4 4 2 4 1 3 3 5 1 1 1 154 147 10 1 1 1 142 1 7 6 2 2 2 2 1 1 1 1 1 1 1 2 4 4 4 4 4 4 9 8 7 9 5 5 1 4 4 4 4 4 3 2 2 3 2 2 2 2 1 1 1 1 1 7 6 6 6 10 165 165 165 165 165 1 335 333 63 33 13 13 4 3 3 3 3 2 1 2 4 3 2 2 2 2 5 4 3 3 3 2 8 5 16 1 166 9 2 334 11 948 949 949 178 176 163 151 941 711 709 710 3 711 4 711 2 1 156 154 154 154 1 218 1 20 20 1 10 142 144 145 708 178 175 169 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> #include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> #include <net/netdev_lock.h> #include <net/wext.h> #include "dev.h" /* * Map an interface index to its name (SIOCGIFNAME) */ /* * We need this ioctl for efficient implementation of the * if_indextoname() function required by the IPv6 API. Without * it, we would have to search all the interfaces to find a * match. --pb */ static int dev_ifname(struct net *net, struct ifreq *ifr) { ifr->ifr_name[IFNAMSIZ-1] = 0; return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; void __user *pos; size_t size; int len, total = 0, done; /* both the ifconf and the ifreq structures are slightly different */ if (in_compat_syscall()) { struct compat_ifconf ifc32; if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) return -EFAULT; pos = compat_ptr(ifc32.ifcbuf); len = ifc32.ifc_len; size = sizeof(struct compat_ifreq); } else { struct ifconf ifc; if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) return -EFAULT; pos = ifc.ifc_buf; len = ifc.ifc_len; size = sizeof(struct ifreq); } /* Loop over the interfaces, and write an info block for each. */ rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); else done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { rtnl_net_unlock(net); return -EFAULT; } total += done; } rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) { struct ifmap *ifmap = &ifr->ifr_map; if (in_compat_syscall()) { struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; cifmap->mem_start = dev->mem_start; cifmap->mem_end = dev->mem_end; cifmap->base_addr = dev->base_addr; cifmap->irq = dev->irq; cifmap->dma = dev->dma; cifmap->port = dev->if_port; return 0; } ifmap->mem_start = dev->mem_start; ifmap->mem_end = dev->mem_end; ifmap->base_addr = dev->base_addr; ifmap->irq = dev->irq; ifmap->dma = dev->dma; ifmap->port = dev->if_port; return 0; } static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; if (!dev->netdev_ops->ndo_set_config) return -EOPNOTSUPP; if (in_compat_syscall()) { struct ifmap ifmap = { .mem_start = cifmap->mem_start, .mem_end = cifmap->mem_end, .base_addr = cifmap->base_addr, .irq = cifmap->irq, .dma = cifmap->dma, .port = cifmap->port, }; return dev->netdev_ops->ndo_set_config(dev, &ifmap); } return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); } /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ ifr->ifr_flags = (short)netif_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ifr->ifr_metric = 0; return 0; case SIOCGIFMTU: /* Get the MTU of a device */ ifr->ifr_mtu = dev->mtu; return 0; case SIOCGIFSLAVE: err = -EINVAL; break; case SIOCGIFMAP: return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; return 0; case SIOCGIFTXQLEN: ifr->ifr_qlen = dev->tx_queue_len; return 0; default: /* dev_ioctl() should ensure this case * is never reached */ WARN_ON(1); err = -ENOTTY; break; } return err; } int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; int tx_type_valid = 0; int rx_filter_valid = 0; if (cfg->flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; tx_type = cfg->tx_type; rx_filter = cfg->rx_filter; switch (tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; case __HWTSTAMP_TX_CNT: /* not a real value */ break; } switch (rx_filter) { case HWTSTAMP_FILTER_NONE: case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; case __HWTSTAMP_FILTER_CNT: /* not a real value */ break; } if (!tx_type_valid || !rx_filter_valid) return -ERANGE; return 0; } /** * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * * Helper for calling the default hardware provider timestamping. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not * implemented for now, which is still more accurate than letting the netdev * handle the GET request. */ int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct hwtstamp_provider *hwprov; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { cfg->qualifier = hwprov->desc.qualifier; if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && hwprov->phydev) return phy_hwtstamp_get(hwprov->phydev, cfg); if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); return -EOPNOTSUPP; } if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); } static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct hwtstamp_config cfg; int err; if (!ops->ndo_hwtstamp_get) return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; kernel_cfg.ifr = ifr; netdev_lock_ops(dev); err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); netdev_unlock_ops(dev); if (err) return err; /* If the request was resolved through an unconverted driver, omit * the copy_to_user(), since the implementation has already done that */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } /** * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * @extack: Netlink extended ack message structure, for error reporting * * Helper for enforcing a common policy that phylib timestamping, if available, * should take precedence in front of hardware timestamping provided by the * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped * frames and not forward them), it must set dev->see_all_hwtstamp_requests. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config old_cfg = {}; struct hwtstamp_provider *hwprov; struct phy_device *phydev; bool changed = false; bool phy_ts; int err; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && hwprov->phydev) { phy_ts = true; phydev = hwprov->phydev; } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { phy_ts = false; } else { return -EOPNOTSUPP; } cfg->qualifier = hwprov->desc.qualifier; } else { phy_ts = phy_is_default_hwtstamp(dev->phydev); if (phy_ts) phydev = dev->phydev; } cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } if (!phy_ts || dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) netdev_err(dev, "%s\n", extack->_msg); return err; } } if (phy_ts && dev->see_all_hwtstamp_requests) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); return err; } } return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) return err; err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack); if (err) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return err; } if (!ops->ndo_hwtstamp_set) return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); netdev_unlock_ops(dev); if (err) return err; /* The driver may have modified the configuration, so copy the * updated version of it back to user space */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq ifrr; int err; if (!kernel_cfg->ifr) return -EINVAL; strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; err = dev_eth_ioctl(dev, &ifrr, cmd); if (err) return err; kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; kernel_cfg->copied_to_user = true; return 0; } int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_get) { int err; netdev_lock_ops(dev); err = dev_get_hwtstamp_phylib(dev, kernel_cfg); netdev_unlock_ops(dev); return err; } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_get_lower); int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_set) { int err; netdev_lock_ops(dev); err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); netdev_unlock_ops(dev); return err; } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocbond(dev, ifr, cmd); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocwandev(dev, ifs); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } /* * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; if (!dev) return -ENODEV; ops = dev->netdev_ops; switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ return -EOPNOTSUPP; case SIOCSIFMTU: /* Set the MTU of a device */ return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; return dev_set_mac_address_user(dev, (struct sockaddr_storage *)&ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof(ifr->ifr_hwaddr.sa_data), (size_t)dev->addr_len)); netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: netdev_lock_ops(dev); err = netif_setifmap(dev, ifr); netdev_unlock_ops(dev); return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); netdev_unlock_ops(dev); return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); netdev_unlock_ops(dev); return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); case SIOCSHWTSTAMP: return dev_set_hwtstamp(dev, ifr); case SIOCGHWTSTAMP: return dev_get_hwtstamp(dev, ifr); case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: return dev_eth_ioctl(dev, ifr, cmd); case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCBONDCHANGEACTIVE: return dev_siocbond(dev, ifr, cmd); /* Unknown ioctl */ default: err = -EINVAL; } return err; } /** * dev_load - load a network module * @net: the applicable net namespace * @name: name of interface * * If a network interface is not present and the process has suitable * privileges this function loads the module. If module loading is not * available in this kernel then it becomes a nop. */ void dev_load(struct net *net, const char *name) { struct net_device *dev; int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); if (no_module && capable(CAP_SYS_MODULE)) request_module("%s", name); } EXPORT_SYMBOL(dev_load); /* * This function handles all "interface"-type I/O control requests. The actual * 'doing' part of this is dev_ifsioc above. */ /** * dev_ioctl - network device ioctl * @net: the applicable net namespace * @cmd: command to issue * @ifr: pointer to a struct ifreq in user space * @data: data exchanged with userspace * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. This is normally called by the * user space syscall interfaces but can sometimes be useful for * other purposes. The return value is the return from the syscall if * positive or a negative errno code on error. */ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout) { int ret; char *colon; if (need_copyout) *need_copyout = true; if (cmd == SIOCGIFNAME) return dev_ifname(net, ifr); ifr->ifr_name[IFNAMSIZ-1] = 0; colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; /* * See which interface the caller is talking about. */ switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); ret = netif_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - can be done by all. * - atomic and do not require locking. * - return a value */ case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr->ifr_name); rcu_read_lock(); ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); if (colon) *colon = ':'; return ret; case SIOCETHTOOL: dev_load(net, ifr->ifr_name); ret = dev_ethtool(net, ifr, data); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - return a value */ case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFMAP: case SIOCSIFTXQLEN: if (!capable(CAP_NET_ADMIN)) return -EPERM; fallthrough; /* * These ioctl calls: * - require local superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); if (need_copyout) *need_copyout = false; return ret; case SIOCGIFMEM: /* Get the per device memory space. We can add this but * currently do not support it */ case SIOCSIFMEM: /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: return -ENOTTY; /* * Unknown or private ioctl. */ default: if (cmd == SIOCWANDEV || cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); return ret; } return -ENOTTY; } } |
| 16655 13549 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif #ifndef topology_is_primary_thread static inline bool topology_is_primary_thread(unsigned int cpu) { /* * When disabling SMT, the primary thread of the SMT will remain * enabled/active. Architectures that have a special primary thread * (e.g. x86) need to override this function. Otherwise the first * thread in the SMT can be made the primary thread. * * The sibling cpumask of an offline CPU always contains the CPU * itself on architectures using the implementation of * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for * building their topology have to check whether to use this default * implementation or to override it. */ return cpu == cpumask_first(topology_sibling_cpumask(cpu)); } #define topology_is_primary_thread topology_is_primary_thread #endif static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_node_numadist() - iterate over nodes in increasing distance * order, starting from a given node * @node: the iteration variable and the starting node. * @unvisited: a nodemask to keep track of the unvisited nodes. * * This macro iterates over NUMA node IDs in increasing distance from the * starting @node and yields MAX_NUMNODES when all the nodes have been * visited. * * Note that by the time the loop completes, the @unvisited nodemask will * be fully cleared, unless the loop exits early. * * The difference between for_each_node() and for_each_node_numadist() is * that the former allows to iterate over nodes in numerical order, whereas * the latter iterates over nodes in increasing order of distance. * * This complexity of this iterator is O(N^2), where N represents the * number of nodes, as each iteration involves scanning all nodes to * find the one with the shortest distance. * * Requires rcu_lock to be held. */ #define for_each_node_numadist(node, unvisited) \ for (int __start = (node), \ (node) = nearest_node_nodemask((__start), &(unvisited)); \ (node) < MAX_NUMNODES; \ node_clear((node), (unvisited)), \ (node) = nearest_node_nodemask((__start), &(unvisited))) /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) DECLARE_PER_CPU(unsigned long, cpu_scale); static inline unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); #endif /* _LINUX_TOPOLOGY_H */ |
| 24 33 14 2 36 316 3 27 1 107 1 31 88 101 1 9 248 242 21 8 17 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SKCIPHER_H #define _CRYPTO_SKCIPHER_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> /* Set this bit if the lskcipher operation is a continuation. */ #define CRYPTO_LSKCIPHER_FLAG_CONT 0x00000001 /* Set this bit if the lskcipher operation is final. */ #define CRYPTO_LSKCIPHER_FLAG_FINAL 0x00000002 /* The bit CRYPTO_TFM_REQ_MAY_SLEEP can also be set if needed. */ /* Set this bit if the skcipher operation is a continuation. */ #define CRYPTO_SKCIPHER_REQ_CONT 0x00000001 /* Set this bit if the skcipher operation is not final. */ #define CRYPTO_SKCIPHER_REQ_NOTFINAL 0x00000002 struct scatterlist; /** * struct skcipher_request - Symmetric key cipher request * @cryptlen: Number of bytes to encrypt or decrypt * @iv: Initialisation Vector * @src: Source SG list * @dst: Destination SG list * @base: Underlying async request * @__ctx: Start of private context data */ struct skcipher_request { unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; struct crypto_async_request base; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_skcipher { unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_skcipher { struct crypto_skcipher base; }; struct crypto_lskcipher { struct crypto_tfm base; }; /* * struct skcipher_alg_common - common properties of skcipher_alg * @min_keysize: Minimum key size supported by the transformation. This is the * smallest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MIN_KEY_SIZE" include/crypto/ * @max_keysize: Maximum key size supported by the transformation. This is the * largest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MAX_KEY_SIZE" include/crypto/ * @ivsize: IV size applicable for transformation. The consumer must provide an * IV of exactly that size to perform the encrypt or decrypt operation. * @chunksize: Equal to the block size except for stream ciphers such as * CTR where it is set to the underlying block size. * @statesize: Size of the internal state for the algorithm. * @base: Definition of a generic crypto algorithm. */ #define SKCIPHER_ALG_COMMON { \ unsigned int min_keysize; \ unsigned int max_keysize; \ unsigned int ivsize; \ unsigned int chunksize; \ unsigned int statesize; \ \ struct crypto_alg base; \ } struct skcipher_alg_common SKCIPHER_ALG_COMMON; /** * struct skcipher_alg - symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt * the supplied scatterlist containing the blocks of data. The crypto * API consumer is responsible for aligning the entries of the * scatterlist properly and making sure the chunks are correctly * sized. In case a software fallback was put in place in the * @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. In case the * key was stored in transformation context, the key might need to be * re-programmed into the hardware in this function. This function * shall not modify the transformation context, as this function may * be called in parallel with the same transformation object. * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt * and the conditions are exactly the same. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @walksize: Equal to the chunk size except in cases where the algorithm is * considerably more efficient if it can operate on multiple chunks * in parallel. Should be a multiple of chunksize. * @co: see struct skcipher_alg_common * * All fields except @ivsize are mandatory and must be filled. */ struct skcipher_alg { int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct skcipher_request *req); int (*decrypt)(struct skcipher_request *req); int (*export)(struct skcipher_request *req, void *out); int (*import)(struct skcipher_request *req, const void *in); int (*init)(struct crypto_skcipher *tfm); void (*exit)(struct crypto_skcipher *tfm); unsigned int walksize; union { struct SKCIPHER_ALG_COMMON; struct skcipher_alg_common co; }; }; /** * struct lskcipher_alg - linear symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a number of bytes. This function is used to encrypt * the supplied data. This function shall not modify * the transformation context, as this function may be called * in parallel with the same transformation object. Data * may be left over if length is not a multiple of blocks * and there is more to come (final == false). The number of * left-over bytes should be returned in case of success. * The siv field shall be as long as ivsize + statesize with * the IV placed at the front. The state will be used by the * algorithm internally. * @decrypt: Decrypt a number of bytes. This is a reverse counterpart to * @encrypt and the conditions are exactly the same. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @co: see struct skcipher_alg_common */ struct lskcipher_alg { int (*setkey)(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*decrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*init)(struct crypto_lskcipher *tfm); void (*exit)(struct crypto_lskcipher *tfm); struct skcipher_alg_common co; }; #define MAX_SYNC_SKCIPHER_REQSIZE 384 /* * This performs a type-check against the "_tfm" argument to make sure * all users have the correct skcipher tfm for doing on-stack requests. */ #define SYNC_SKCIPHER_REQUEST_ON_STACK(name, _tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ MAX_SYNC_SKCIPHER_REQSIZE \ ] CRYPTO_MINALIGN_ATTR; \ struct skcipher_request *name = \ (((struct skcipher_request *)__##name##_desc)->base.tfm = \ crypto_sync_skcipher_tfm((_tfm)), \ (void *)__##name##_desc) /** * DOC: Symmetric Key Cipher API * * Symmetric key cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_SKCIPHER (listed as type "skcipher" in /proc/crypto). * * Asynchronous cipher operations imply that the function invocation for a * cipher request returns immediately before the completion of the operation. * The cipher request is scheduled as a separate kernel thread and therefore * load-balanced on the different CPUs via the process scheduler. To allow * the kernel crypto API to inform the caller about the completion of a cipher * request, the caller must provide a callback function. That function is * invoked with the cipher handle when the request completes. * * To support the asynchronous operation, additional information than just the * cipher handle must be supplied to the kernel crypto API. That additional * information is given by filling in the skcipher_request data structure. * * For the symmetric key cipher API, the state is maintained with the tfm * cipher handle. A single tfm can be used across multiple calls and in * parallel. For asynchronous block cipher calls, context data supplied and * only used by the caller can be referenced the request data structure in * addition to the IV used for the cipher request. The maintenance of such * state information would be important for a crypto driver implementer to * have, because when calling the callback function upon completion of the * cipher operation, that callback function may need some information about * which operation just finished if it invoked multiple in parallel. This * state information is unused by the kernel crypto API. */ static inline struct crypto_skcipher *__crypto_skcipher_cast( struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_skcipher, base); } /** * crypto_alloc_skcipher() - allocate symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an skcipher. The returned struct * crypto_skcipher is the cipher handle that is required for any subsequent * API invocation for that skcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name, u32 type, u32 mask); /** * crypto_alloc_lskcipher() - allocate linear symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * lskcipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an lskcipher. The returned struct * crypto_lskcipher is the cipher handle that is required for any subsequent * API invocation for that lskcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_skcipher_tfm( struct crypto_skcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_lskcipher_tfm( struct crypto_lskcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_sync_skcipher_tfm( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_tfm(&tfm->base); } /** * crypto_free_skcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_skcipher(struct crypto_skcipher *tfm) { crypto_destroy_tfm(tfm, crypto_skcipher_tfm(tfm)); } static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm) { crypto_free_skcipher(&tfm->base); } /** * crypto_free_lskcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_lskcipher(struct crypto_lskcipher *tfm) { crypto_destroy_tfm(tfm, crypto_lskcipher_tfm(tfm)); } /** * crypto_has_skcipher() - Search for the availability of an skcipher. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher * @type: specifies the type of the skcipher * @mask: specifies the mask for the skcipher * * Return: true when the skcipher is known to the kernel crypto API; false * otherwise */ int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); } static inline const char *crypto_lskcipher_driver_name( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_lskcipher_tfm(tfm)); } static inline struct skcipher_alg_common *crypto_skcipher_alg_common( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg_common, base); } static inline struct skcipher_alg *crypto_skcipher_alg( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg, base); } static inline struct lskcipher_alg *crypto_lskcipher_alg( struct crypto_lskcipher *tfm) { return container_of(crypto_lskcipher_tfm(tfm)->__crt_alg, struct lskcipher_alg, co.base); } /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the skcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->ivsize; } static inline unsigned int crypto_sync_skcipher_ivsize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_ivsize(&tfm->base); } /** * crypto_lskcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the lskcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_lskcipher_ivsize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.ivsize; } /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the skcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_skcipher_blocksize( struct crypto_skcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm)); } /** * crypto_lskcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the lskcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_lskcipher_blocksize( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_lskcipher_tfm(tfm)); } /** * crypto_skcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_skcipher_chunksize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->chunksize; } /** * crypto_lskcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_lskcipher_chunksize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.chunksize; } /** * crypto_skcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_skcipher_statesize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->statesize; } /** * crypto_lskcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_lskcipher_statesize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.statesize; } static inline unsigned int crypto_sync_skcipher_blocksize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_blocksize(&tfm->base); } static inline unsigned int crypto_skcipher_alignmask( struct crypto_skcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm)); } static inline unsigned int crypto_lskcipher_alignmask( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_lskcipher_tfm(tfm)); } static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm) { return crypto_tfm_get_flags(crypto_skcipher_tfm(tfm)); } static inline void crypto_skcipher_set_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_skcipher_tfm(tfm), flags); } static inline void crypto_skcipher_clear_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_skcipher_tfm(tfm), flags); } static inline u32 crypto_sync_skcipher_get_flags( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_get_flags(&tfm->base); } static inline void crypto_sync_skcipher_set_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_set_flags(&tfm->base, flags); } static inline void crypto_sync_skcipher_clear_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_clear_flags(&tfm->base, flags); } static inline u32 crypto_lskcipher_get_flags(struct crypto_lskcipher *tfm) { return crypto_tfm_get_flags(crypto_lskcipher_tfm(tfm)); } static inline void crypto_lskcipher_set_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_lskcipher_tfm(tfm), flags); } static inline void crypto_lskcipher_clear_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_lskcipher_tfm(tfm), flags); } /** * crypto_skcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the skcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm, const u8 *key, unsigned int keylen) { return crypto_skcipher_setkey(&tfm->base, key, keylen); } /** * crypto_lskcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the lskcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); static inline unsigned int crypto_skcipher_min_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->min_keysize; } static inline unsigned int crypto_skcipher_max_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->max_keysize; } static inline unsigned int crypto_lskcipher_min_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.min_keysize; } static inline unsigned int crypto_lskcipher_max_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.max_keysize; } /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained * * Return the crypto_skcipher handle when furnishing an skcipher_request * data structure. * * Return: crypto_skcipher handle */ static inline struct crypto_skcipher *crypto_skcipher_reqtfm( struct skcipher_request *req) { return __crypto_skcipher_cast(req->base.tfm); } static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm( struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); return container_of(tfm, struct crypto_sync_skcipher, base); } /** * crypto_skcipher_encrypt() - encrypt plaintext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_encrypt(struct skcipher_request *req); /** * crypto_skcipher_decrypt() - decrypt ciphertext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_decrypt(struct skcipher_request *req); /** * crypto_skcipher_export() - export partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @out: output buffer of sufficient size that can hold the state * * Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_export(struct skcipher_request *req, void *out); /** * crypto_skcipher_import() - import partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @in: buffer holding the state * * Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_import(struct skcipher_request *req, const void *in); /** * crypto_lskcipher_encrypt() - encrypt plaintext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * Encrypt plaintext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * crypto_lskcipher_decrypt() - decrypt ciphertext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * * Decrypt ciphertext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * DOC: Symmetric Key Cipher Request Handle * * The skcipher_request data structure contains all pointers to data * required for the symmetric key cipher operation. This includes the cipher * handle (which can be used by multiple skcipher_request instances), pointer * to plaintext and ciphertext, asynchronous callback function, etc. It acts * as a handle to the skcipher_request_* API calls in a similar way as * skcipher handle to the crypto_skcipher_* API calls. */ /** * crypto_skcipher_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_skcipher_reqsize(struct crypto_skcipher *tfm) { return tfm->reqsize; } /** * skcipher_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing skcipher handle in the request * data structure with a different one. */ static inline void skcipher_request_set_tfm(struct skcipher_request *req, struct crypto_skcipher *tfm) { req->base.tfm = crypto_skcipher_tfm(tfm); } static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req, struct crypto_sync_skcipher *tfm) { skcipher_request_set_tfm(req, &tfm->base); } static inline struct skcipher_request *skcipher_request_cast( struct crypto_async_request *req) { return container_of(req, struct skcipher_request, base); } /** * skcipher_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the skcipher * encrypt and decrypt API calls. During the allocation, the provided skcipher * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct skcipher_request *skcipher_request_alloc_noprof( struct crypto_skcipher *tfm, gfp_t gfp) { struct skcipher_request *req; req = kmalloc_noprof(sizeof(struct skcipher_request) + crypto_skcipher_reqsize(tfm), gfp); if (likely(req)) skcipher_request_set_tfm(req, tfm); return req; } #define skcipher_request_alloc(...) alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__)) /** * skcipher_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void skcipher_request_free(struct skcipher_request *req) { kfree_sensitive(req); } static inline void skcipher_request_zero(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); memzero_explicit(req, sizeof(*req) + crypto_skcipher_reqsize(tfm)); } /** * skcipher_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once the * cipher operation completes. * * The callback function is registered with the skcipher_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void skcipher_request_set_callback(struct skcipher_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * skcipher_request_set_crypt() - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_skcipher_ivsize * * This function allows setting of the source data and destination data * scatter / gather lists. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. */ static inline void skcipher_request_set_crypt( struct skcipher_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, void *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } #endif /* _CRYPTO_SKCIPHER_H */ |
| 114 114 113 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/virtio.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue mapping operations. */ struct virtqueue *vring_create_virtqueue_map(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, union virtio_map map); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ |
| 18 44 45 39 39 39 13 38 6 9 38 22 22 21 22 22 22 31 33 33 22 22 22 21 22 36 37 36 36 33 37 36 7 8 8 8 8 8 4 4 4 4 22 4 7 4 4 4 4 4 4 4 4 4 22 22 22 21 22 22 21 22 21 22 22 22 21 22 21 22 22 22 22 22 22 22 21 22 22 22 22 22 22 22 22 35 35 35 32 27 6 8 1 8 6 35 32 15 32 31 32 22 35 35 39 38 38 37 39 38 39 38 38 38 38 4 39 4 4 39 4 8 8 33 33 33 33 4 31 33 38 17 17 17 17 17 17 17 17 17 10 9 9 9 9 9 9 9 9 9 10 7 10 10 7 10 9 10 10 29 29 29 10 10 10 10 10 9 8 9 6 9 9 9 9 9 9 9 9 7 9 9 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.c * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Hugh Dickins <hughd@google.com> * Zheng Liu <wenqing.lz@taobao.com> * * Ext4 extents status tree core functions. */ #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "ext4.h" #include <trace/events/ext4.h> /* * According to previous discussion in Ext4 Developer Workshop, we * will introduce a new structure called io tree to track all extent * status in order to solve some problems that we have met * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called * delay extent tree at the first commit. But for better understand * what it does, it has been rename to extent status tree. * * Step1: * Currently the first step has been done. All delayed extents are * tracked in the tree. It maintains the delayed extent when a delayed * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. * * Step2: * In this step all extent status are tracked by extent status tree. * Thus, we can first try to lookup a block mapping in this tree before * finding it in extent tree. Hence, single extent cache can be removed * because extent status tree can do a better job. Extents in status * tree are loaded on-demand. Therefore, the extent status tree may not * contain all of the extents in a file. Meanwhile we define a shrinker * to reclaim memory from extent status tree because fragmented extent * tree will make status tree cost too much memory. written/unwritten/- * hole extents in the tree will be reclaimed by this shrinker when we * are under high memory pressure. Delayed extents will not be * reclimed because fiemap, bigalloc, and seek_data/hole need it. */ /* * Extent status tree implementation for ext4. * * * ========================================================================== * Extent status tree tracks all extent status. * * 1. Why we need to implement extent status tree? * * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a * block or a range of blocks are belonged to a delayed extent. * * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * * -- SEEK_HOLE/DATA * SEEK_HOLE/DATA has the same problem as FIEMAP. * * -- bigalloc * bigalloc looks up page cache to figure out if a block is * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time consuming. * * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or * not by searching the extent tree. * * * ========================================================================== * 2. Ext4 extent status tree impelmentation * * -- extent * A extent is a range of blocks which are contiguous logically and * physically. Unlike extent in extent tree, this extent in ext4 is * a in-memory struct, there is no corresponding on-disk data. There * is no limit on length of extent, so an extent can contain as many * blocks as they are contiguous logically and physically. * * -- extent status tree * Every inode has an extent status tree and all allocation blocks * are added to the tree with different status. The extent in the * tree are ordered by logical block no. * * -- operations on a extent status tree * There are three important operations on a delayed extent tree: find * next extent, adding a extent(a range of blocks) and removing a extent. * * -- race on a extent status tree * Extent status tree is protected by inode->i_es_lock. * * -- memory consumption * Fragmented extent tree will make extent status tree cost too much * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * * ========================================================================== * 3. Assurance of Ext4 extent status tree consistency * * When mapping blocks, Ext4 queries the extent status tree first and should * always trusts that the extent status tree is consistent and up to date. * Therefore, it is important to adheres to the following rules when createing, * modifying and removing extents. * * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, * the extent information should always be processed through the extent * status tree instead of being organized manually through the on-disk * extent tree. * * 2. When updating the extent tree, Ext4 should acquire the i_data_sem * exclusively and update the extent status tree atomically. If the extents * to be modified are large enough to exceed the range that a single * i_data_sem can process (as ext4_datasem_ensure_credits() may drop * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() * does): * * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures * exclusion against page faults, as well as reads and writes that may * concurrently modify the extent status tree. * b) Evict all page cache in the affected range and recommend rebuilding * or dropping the extent status tree after modifying the on-disk * extent tree. This ensures exclusion against concurrent writebacks * that do not hold those locks but only holds a folio lock. * * 3. Based on the rules above, when querying block mappings, Ext4 should at * least hold the i_rwsem or invalidate_lock or folio lock(s) for the * specified querying range. * * ========================================================================== * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. * * -- gain * 2. Code is much simpler, more readable, more maintainable and * more efficient. * * * ========================================================================== * 5. TODO list * * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, int *reserved, struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, struct pending_reservation **prealloc); int __init ext4_init_es(void) { ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); if (ext4_es_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_es(void) { kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) { tree->root = RB_ROOT; tree->cache_es = NULL; } #ifdef ES_DEBUG__ static void ext4_es_print_tree(struct inode *inode) { struct ext4_es_tree *tree; struct rb_node *node; printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); tree = &EXT4_I(inode)->i_es_tree; node = rb_first(&tree->root); while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_es_print_tree(inode) #endif static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { BUG_ON(es->es_lblk + es->es_len < es->es_lblk); return es->es_lblk + es->es_len - 1; } static inline void ext4_es_inc_seq(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); } /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); if (lblk < es->es_lblk) node = node->rb_left; else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } if (es && lblk < es->es_lblk) return es; if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? rb_entry(node, struct extent_status, rb_node) : NULL; } return NULL; } /* * ext4_es_find_extent_range - find extent with specified status within block * range or next extent following block range in * extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * @es - extent found, if any * * Find the first extent within the block range specified by @lblk and @end * in the extents status tree that satisfies @matching_fn. If a match * is found, it's returned in @es. If not, and a matching extent is found * beyond the block range, it's returned in @es. If no match is found, an * extent is returned in @es whose es_lblk, es_len, and es_pblk components * are 0. */ static void __es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; WARN_ON(es == NULL); WARN_ON(end < lblk); tree = &EXT4_I(inode)->i_es_tree; /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u) %llu %x\n", lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } es1 = __es_tree_search(&tree->root, lblk); out: if (es1 && !matching_fn(es1)) { while ((node = rb_next(&es1->rb_node)) != NULL) { es1 = rb_entry(node, struct extent_status, rb_node); if (es1->es_lblk > end) { es1 = NULL; break; } if (matching_fn(es1)) break; } } if (es1 && matching_fn(es1)) { WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; } } /* * Locking for __es_find_extent_range() for external use */ void ext4_es_find_extent_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es) { es->es_lblk = es->es_len = es->es_pblk = 0; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; trace_ext4_es_find_extent_range_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); __es_find_extent_range(inode, matching_fn, lblk, end, es); read_unlock(&EXT4_I(inode)->i_es_lock); trace_ext4_es_find_extent_range_exit(inode, es); } /* * __es_scan_range - search block range for block with specified status * in extents status tree * * @inode - file containing the range * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block defining start of range * @end - logical block defining end of range * * Returns true if at least one block in the specified block range satisfies * the criterion specified by @matching_fn, and false if not. If at least * one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. */ static bool __es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t start, ext4_lblk_t end) { struct extent_status es; __es_find_extent_range(inode, matching_fn, start, end, &es); if (es.es_len == 0) return false; /* no matching extent in the tree */ else if (es.es_lblk <= start && start < es.es_lblk + es.es_len) return true; else if (start <= es.es_lblk && es.es_lblk <= end) return true; else return false; } /* * Locking for __es_scan_range() for external use */ bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_range(inode, matching_fn, lblk, end); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } /* * __es_scan_clu - search cluster for block with specified status in * extents status tree * * @inode - file containing the cluster * @matching_fn - pointer to function that matches extents with desired status * @lblk - logical block in cluster to be searched * * Returns true if at least one extent in the cluster containing @lblk * satisfies the criterion specified by @matching_fn, and false if not. If at * least one extent has the specified status, then there is at least one block * in the cluster with that status. Should only be called by code that has * taken i_es_lock. */ static bool __es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); } /* * Locking for __es_scan_clu() for external use */ bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk) { bool ret; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return false; read_lock(&EXT4_I(inode)->i_es_lock); ret = __es_scan_clu(inode, matching_fn, lblk); read_unlock(&EXT4_I(inode)->i_es_lock); return ret; } static void ext4_es_list_add(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (!list_empty(&ei->i_es_list)) return; spin_lock(&sbi->s_es_lock); if (list_empty(&ei->i_es_list)) { list_add_tail(&ei->i_es_list, &sbi->s_es_list); sbi->s_es_nr_inode++; } spin_unlock(&sbi->s_es_lock); } static void ext4_es_list_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); spin_lock(&sbi->s_es_lock); if (!list_empty(&ei->i_es_list)) { list_del_init(&ei->i_es_list); sbi->s_es_nr_inode--; WARN_ON_ONCE(sbi->s_es_nr_inode < 0); } spin_unlock(&sbi->s_es_lock); } static inline struct pending_reservation *__alloc_pending(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); } static inline void __free_pending(struct pending_reservation *pr) { kmem_cache_free(ext4_pending_cachep, pr); } /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. */ static inline bool ext4_es_must_keep(struct extent_status *es) { /* fiemap, bigalloc, and seek_data/hole need to use it. */ if (ext4_es_is_delayed(es)) return true; return false; } static inline struct extent_status *__es_alloc_extent(bool nofail) { if (!nofail) return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); } static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; /* We never try to reclaim a must kept extent, so we don't count it. */ if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++) ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); } static inline void __es_free_extent(struct extent_status *es) { kmem_cache_free(ext4_es_cachep, es); } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); /* Decrease the shrink counter when we can reclaim the extent. */ if (!ext4_es_must_keep(es)) { BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr) ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> s_es_stats.es_stats_shk_cnt); } __es_free_extent(es); } /* * Check whether or not two extents can be merged * Condition: * - logical block number is contiguous * - physical block number is contiguous * - status is equal */ static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { pr_warn("ES assertion failed when merging extents. " "The sum of lengths of es1 (%d) and es2 (%d) " "is bigger than allowed file size (%d)\n", es1->es_len, es2->es_len, EXT_MAX_BLOCKS); WARN_ON(1); return 0; } if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) return 1; if (ext4_es_is_hole(es1)) return 1; /* we need to check delayed extent */ if (ext4_es_is_delayed(es1)) return 1; return 0; } static struct extent_status * ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_prev(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; if (ext4_es_is_referenced(es)) ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; } return es; } static struct extent_status * ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct extent_status *es1; struct rb_node *node; node = rb_next(&es->rb_node); if (!node) return es; es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; if (ext4_es_is_referenced(es1)) ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } return es; } #ifdef ES_AGGRESSIVE_TEST #include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ static void ext4_es_insert_extent_ext_check(struct inode *inode, struct extent_status *es) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t ee_block; ext4_fsblk_t ee_start; unsigned short ee_len; int depth, ee_status, es_status; path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; depth = ext_depth(inode); ex = path[depth].p_ext; if (ex) { ee_block = le32_to_cpu(ex->ee_block); ee_start = ext4_ext_pblock(ex); ee_len = ext4_ext_get_actual_len(ex); ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; es_status = ext4_es_is_unwritten(es) ? 1 : 0; /* * Make sure ex and es are not overlap when we try to insert * a delayed/hole extent. */ if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add a delayed/hole extent " "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } goto out; } /* * We don't check ee_block == es->es_lblk, etc. because es * might be a part of whole extent, vice versa. */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), es_status ? 'u' : 'w'); goto out; } if (ee_status ^ es_status) { pr_warn("ES insert assertion failed for inode: %lu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, ext4_es_pblock(es), es_status ? 'u' : 'w'); } } else { /* * We can't find an extent on disk. So we need to make sure * that we don't want to add an written/unwritten extent. */ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " "to add a written/unwritten extent " "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } } out: ext4_free_ext_path(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, struct extent_status *es) { struct ext4_map_blocks map; int retval; /* * Here we call ext4_ind_map_blocks to lookup a block mapping because * 'Indirect' structure is defined in indirect.c. So we couldn't * access direct/indirect tree from outside. It is too dirty to define * this function in indirect.c file. */ map.m_lblk = es->es_lblk; map.m_len = es->es_len; retval = ext4_ind_map_blocks(NULL, inode, &map, 0); if (retval > 0) { if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { /* * We want to add a delayed/hole extent but this * block has been allocated. */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { pr_warn("ES insert assertion failed for " "inode: %lu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { pr_warn("ES insert assertion failed for " "inode: %lu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, ext4_es_pblock(es)); return; } } else { /* * We don't need to check unwritten extent because * indirect-based file doesn't have it. */ BUG(); } } else if (retval == 0) { if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; } } } static inline void ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es) { /* * We don't need to worry about the race condition because * caller takes i_data_sem locking. */ BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_es_insert_extent_ext_check(inode, es); else ext4_es_insert_extent_ind_check(inode, es); } #else static inline void ext4_es_insert_extent_check(struct inode *inode, struct extent_status *es) { } #endif static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct extent_status *es; while (*p) { parent = *p; es = rb_entry(parent, struct extent_status, rb_node); if (newes->es_lblk < es->es_lblk) { if (ext4_es_can_be_merged(newes, es)) { /* * Here we can modify es_lblk directly * because it isn't overlapped. */ es->es_lblk = newes->es_lblk; es->es_len += newes->es_len; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) ext4_es_store_pblock(es, newes->es_pblk); es = ext4_es_try_to_merge_left(inode, es); goto out; } p = &(*p)->rb_left; } else if (newes->es_lblk > ext4_es_end(es)) { if (ext4_es_can_be_merged(es, newes)) { es->es_len += newes->es_len; es = ext4_es_try_to_merge_right(inode, es); goto out; } p = &(*p)->rb_right; } else { BUG(); return -EINVAL; } } if (prealloc) es = prealloc; else es = __es_alloc_extent(false); if (!es) return -ENOMEM; ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, newes->es_pblk); rb_link_node(&es->rb_node, parent, p); rb_insert_color(&es->rb_node, &tree->root); out: tree->cache_es = es; return 0; } /* * ext4_es_insert_extent() adds information to an inode's extent * status tree. */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; int resv_used = 0, pending = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; struct pending_reservation *pr = NULL; bool revise_pending = false; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; BUG_ON(end < lblk); WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); ext4_es_insert_extent_check(inode, &newes); revise_pending = sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & (EXTENT_STATUS_WRITTEN | EXTENT_STATUS_UNWRITTEN)); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); if ((err1 || err2 || err3 < 0) && revise_pending && !pr) pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es1) { if (!es1->es_len) __es_free_extent(es1); es1 = NULL; } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es2) { if (!es2->es_len) __es_free_extent(es2); es2 = NULL; } if (revise_pending) { err3 = __revise_pending(inode, lblk, len, &pr); if (err3 < 0) goto error; if (pr) { __free_pending(pr); pr = NULL; } pending = err3; } /* * TODO: For cache on-disk extents, there is no need to increment * the sequence counter, this requires future optimization. */ ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. * * When direct allocating (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by ext4_nonda_switch()) * an extent either 1) contains delayed blocks but start with * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed * allocated blocks which belong to delayed allocated clusters when * bigalloc feature is enabled, quota has already been claimed by * ext4_mb_new_blocks(), so release the quota reservations made for * any previously delayed allocated clusters instead of claim them * again. */ resv_used += pending; if (resv_used) ext4_da_update_reserve_space(inode, resv_used, delalloc_reserve_used); if (err1 || err2 || err3 < 0) goto retry; trace_ext4_es_insert_extent(inode, &newes); ext4_es_print_tree(inode); return; } /* * ext4_es_cache_extent() inserts information into the extent status * tree if and only if there isn't information about the range in * question already. */ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status) { struct extent_status *es; struct extent_status newes; ext4_lblk_t end = lblk + len - 1; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) return; BUG_ON(end < lblk); write_lock(&EXT4_I(inode)->i_es_lock); es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); if (!es || es->es_lblk > end) __es_insert_extent(inode, &newes, NULL); write_unlock(&EXT4_I(inode)->i_es_lock); } /* * ext4_es_lookup_extent() looks up an extent in extent status tree. * * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. * * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es, u64 *pseq) { struct ext4_es_tree *tree; struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0; trace_ext4_es_lookup_extent_enter(inode, lblk); es_debug("lookup extent in block %u\n", lblk); tree = &EXT4_I(inode)->i_es_tree; read_lock(&EXT4_I(inode)->i_es_lock); /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; es1 = READ_ONCE(tree->cache_es); if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u)\n", lblk, es1->es_lblk, es1->es_len); found = 1; goto out; } node = tree->root.rb_node; while (node) { es1 = rb_entry(node, struct extent_status, rb_node); if (lblk < es1->es_lblk) node = node->rb_left; else if (lblk > ext4_es_end(es1)) node = node->rb_right; else { found = 1; break; } } out: stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); percpu_counter_inc(&stats->es_stats_cache_hits); if (next_lblk) { node = rb_next(&es1->rb_node); if (node) { es1 = rb_entry(node, struct extent_status, rb_node); *next_lblk = es1->es_lblk; } else *next_lblk = 0; } if (pseq) *pseq = EXT4_I(inode)->i_es_seq; } else { percpu_counter_inc(&stats->es_stats_cache_misses); } read_unlock(&EXT4_I(inode)->i_es_lock); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } struct rsvd_count { int ndelayed; bool first_do_lblk_found; ext4_lblk_t first_do_lblk; ext4_lblk_t last_do_lblk; struct extent_status *left_es; bool partial; ext4_lblk_t lclu; }; /* * init_rsvd - initialize reserved count data before removing block range * in file from extent status tree * * @inode - file containing range * @lblk - first block in range * @es - pointer to first extent in range * @rc - pointer to reserved count data * * Assumes es is not NULL */ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node; rc->ndelayed = 0; /* * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track */ if (sbi->s_cluster_ratio > 1) { rc->first_do_lblk_found = false; if (lblk > es->es_lblk) { rc->left_es = es; } else { node = rb_prev(&es->rb_node); rc->left_es = node ? rb_entry(node, struct extent_status, rb_node) : NULL; } rc->partial = false; } } /* * count_rsvd - count the clusters containing delayed blocks in a range * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range * @len - length of range in blocks * @es - pointer to extent containing clusters to be counted * @rc - pointer to reserved count data * * Tracks partial clusters found at the beginning and end of extents so * they aren't overcounted when they span adjacent extents */ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct extent_status *es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t i, end, nclu; if (!ext4_es_is_delayed(es)) return; WARN_ON(len <= 0); if (sbi->s_cluster_ratio == 1) { rc->ndelayed += (int) len; return; } /* bigalloc */ i = (lblk < es->es_lblk) ? es->es_lblk : lblk; end = lblk + (ext4_lblk_t) len - 1; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; /* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; } /* update the last lblk in the region seen so far */ rc->last_do_lblk = end; /* * if we're tracking a partial cluster and the current extent * doesn't start with it, count it and stop tracking */ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { rc->ndelayed++; rc->partial = false; } /* * if the first cluster doesn't start on a cluster boundary but * ends on one, count it */ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) { rc->ndelayed++; rc->partial = false; i = EXT4_LBLK_CFILL(sbi, i) + 1; } } /* * if the current cluster starts on a cluster boundary, count the * number of whole delayed clusters in the extent */ if ((i + sbi->s_cluster_ratio - 1) <= end) { nclu = (end - i + 1) >> sbi->s_cluster_bits; rc->ndelayed += nclu; i += nclu << sbi->s_cluster_bits; } /* * start tracking a partial cluster if there's a partial at the end * of the current extent and we're not already tracking one */ if (!rc->partial && i <= end) { rc->partial = true; rc->lclu = EXT4_B2C(sbi, i); } } /* * __pr_tree_search - search for a pending cluster reservation * * @root - root of pending reservation tree * @lclu - logical cluster to search for * * Returns the pending reservation for the cluster identified by @lclu * if found. If not, returns a reservation for the next cluster if any, * and if not, returns NULL. */ static struct pending_reservation *__pr_tree_search(struct rb_root *root, ext4_lblk_t lclu) { struct rb_node *node = root->rb_node; struct pending_reservation *pr = NULL; while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); if (lclu < pr->lclu) node = node->rb_left; else if (lclu > pr->lclu) node = node->rb_right; else return pr; } if (pr && lclu < pr->lclu) return pr; if (pr && lclu > pr->lclu) { node = rb_next(&pr->rb_node); return node ? rb_entry(node, struct pending_reservation, rb_node) : NULL; } return NULL; } /* * get_rsvd - calculates and returns the number of cluster reservations to be * released when removing a block range from the extent status tree * and releases any pending reservations within the range * * @inode - file containing block range * @end - last block in range * @right_es - pointer to extent containing next block beyond end or NULL * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of * clusters containing delayed blocks within the range, minus the number of * clusters still containing delayed blocks at the ends of the range, and * minus the number of pending reservations within the range. */ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, struct rsvd_count *rc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node; ext4_lblk_t first_lclu, last_lclu; bool left_delayed, right_delayed, count_pending; struct extent_status *es; if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial) rc->ndelayed++; if (rc->ndelayed == 0) return 0; first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); /* * decrease the delayed count by the number of clusters at the * ends of the range that still contain delayed blocks - * these clusters still need to be reserved */ left_delayed = right_delayed = false; es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { if (ext4_es_is_delayed(es)) { rc->ndelayed--; left_delayed = true; break; } node = rb_prev(&es->rb_node); if (!node) break; es = rb_entry(node, struct extent_status, rb_node); } if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) { es = right_es; } else { node = rb_next(&right_es->rb_node); es = node ? rb_entry(node, struct extent_status, rb_node) : NULL; } while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { if (ext4_es_is_delayed(es)) { rc->ndelayed--; right_delayed = true; break; } node = rb_next(&es->rb_node); if (!node) break; es = rb_entry(node, struct extent_status, rb_node); } } /* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree. */ if (first_lclu == last_lclu) { if (left_delayed | right_delayed) count_pending = false; else count_pending = true; } else { if (left_delayed) first_lclu++; if (right_delayed) last_lclu--; if (first_lclu <= last_lclu) count_pending = true; else count_pending = false; } /* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released */ if (count_pending) { pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) { rc->ndelayed--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); if (!node) break; pr = rb_entry(node, struct pending_reservation, rb_node); } } } return rc->ndelayed; } /* * __es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @end - last block in range * @reserved - number of cluster reservations released * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc * enabled cancels pending reservations as needed. Returns 0 on success, * error code on failure. */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, int *reserved, struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; struct extent_status *es; struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; int err = 0; bool count_reserved = true; struct rsvd_count rc; if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; es = __es_tree_search(&tree->root, lblk); if (!es) goto out; if (es->es_lblk > end) goto out; /* Simply invalidate cache_es. */ tree->cache_es = NULL; if (count_reserved) init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; orig_es.es_pblk = es->es_pblk; len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; if (len1 > 0) es->es_len = len1; if (len2 > 0) { if (len1 > 0) { struct extent_status newes; newes.es_lblk = end + 1; newes.es_len = len2; block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; ext4_es_store_pblock_status(&newes, block, ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes, prealloc); if (err) { if (!ext4_es_must_keep(&newes)) return 0; es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; goto out; } } else { es->es_lblk = end + 1; es->es_len = len2; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { block = orig_es.es_pblk + orig_es.es_len - len2; ext4_es_store_pblock(es, block); } } if (count_reserved) count_rsvd(inode, orig_es.es_lblk + len1, orig_es.es_len - len1 - len2, &orig_es, &rc); goto out_get_reserved; } if (len1 > 0) { if (count_reserved) count_rsvd(inode, lblk, orig_es.es_len - len1, &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); else es = NULL; } while (es && ext4_es_end(es) <= end) { if (count_reserved) count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); if (!node) { es = NULL; break; } es = rb_entry(node, struct extent_status, rb_node); } if (es && es->es_lblk < end + 1) { ext4_lblk_t orig_len = es->es_len; len1 = ext4_es_end(es) - end; if (count_reserved) count_rsvd(inode, es->es_lblk, orig_len - len1, es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { block = es->es_pblk + orig_len - len1; ext4_es_store_pblock(es, block); } } out_get_reserved: if (count_reserved) *reserved = get_rsvd(inode, end, es, &rc); out: return err; } /* * ext4_es_remove_extent - removes block range from extent status tree * * @inode - file containing range * @lblk - first block in range * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending * reservations as needed. */ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; int reserved = 0; struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) return; end = lblk + len - 1; BUG_ON(end < lblk); retry: if (err && !es) es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); if (err) goto error; /* Free preallocated extent if it didn't get used. */ if (es) { if (!es->es_len) __es_free_extent(es); es = NULL; } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; trace_ext4_es_remove_extent(inode, lblk, len); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; ktime_t start_time; u64 scan_time; int nr_to_walk; int nr_shrunk = 0; int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); retry: spin_lock(&sbi->s_es_lock); nr_to_walk = sbi->s_es_nr_inode; while (nr_to_walk-- > 0) { if (list_empty(&sbi->s_es_list)) { spin_unlock(&sbi->s_es_lock); goto out; } ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, i_es_list); /* Move the inode to the tail */ list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* * Normally we try hard to avoid shrinking precached inodes, * but we will as a last resort. */ if (!retried && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; continue; } if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { nr_skipped++; continue; } /* * Now we hold i_es_lock which protects us from inode reclaim * freeing inode under us */ spin_unlock(&sbi->s_es_lock); nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); if (nr_to_scan <= 0) goto out; spin_lock(&sbi->s_es_lock); } spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; goto retry; } if (locked_ei && nr_shrunk == 0) nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + es_stats->es_stats_scan_time*3) / 4; else es_stats->es_stats_scan_time = scan_time; if (scan_time > es_stats->es_stats_max_scan_time) es_stats->es_stats_max_scan_time = scan_time; if (likely(es_stats->es_stats_shrunk)) es_stats->es_stats_shrunk = (nr_shrunk + es_stats->es_stats_shrunk*3) / 4; else es_stats->es_stats_shrunk = nr_shrunk; trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } static unsigned long ext4_es_count(struct shrinker *shrink, struct shrink_control *sc) { unsigned long nr; struct ext4_sb_info *sbi; sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); struct ext4_es_stats *es_stats = &sbi->s_es_stats; struct ext4_inode_info *ei, *max = NULL; unsigned int inode_cnt = 0; if (v != SEQ_START_TOKEN) return 0; /* here we just find an inode that has the max nr. of objects */ spin_lock(&sbi->s_es_lock); list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lld/%lld cache hits/misses\n", percpu_counter_sum_positive(&es_stats->es_stats_cache_hits), percpu_counter_sum_positive(&es_stats->es_stats_cache_misses)); if (inode_cnt) seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); if (inode_cnt) seq_printf(seq, "maximum:\n %lu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; } int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; /* Make sure we have enough bits for physical block number */ BUILD_BUG_ON(ES_SHIFT < 48); INIT_LIST_HEAD(&sbi->s_es_list); sbi->s_es_nr_inode = 0; spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0, GFP_KERNEL); if (err) return err; err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0, GFP_KERNEL); if (err) goto err1; sbi->s_es_stats.es_stats_scan_time = 0; sbi->s_es_stats.es_stats_max_scan_time = 0; err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) goto err2; err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err3; sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); if (!sbi->s_es_shrinker) { err = -ENOMEM; goto err4; } sbi->s_es_shrinker->scan_objects = ext4_es_scan; sbi->s_es_shrinker->count_objects = ext4_es_count; sbi->s_es_shrinker->private_data = sbi; shrinker_register(sbi->s_es_shrinker); return 0; err4: percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err3: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); err2: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits); percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); shrinker_free(sbi->s_es_shrinker); } /* * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at * most *nr_to_scan extents, update *nr_to_scan accordingly. * * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. * Increment *nr_shrunk by the number of reclaimed extents. Also update * ei->i_es_shrink_lblk to where we should continue scanning. */ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, int *nr_to_scan, int *nr_shrunk) { struct inode *inode = &ei->vfs_inode; struct ext4_es_tree *tree = &ei->i_es_tree; struct extent_status *es; struct rb_node *node; es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; return 0; } (*nr_to_scan)--; node = rb_next(&es->rb_node); if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) { ext4_es_clear_referenced(es); goto next; } rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); (*nr_shrunk)++; next: if (!node) goto out_wrap; es = rb_entry(node, struct extent_status, rb_node); } ei->i_es_shrink_lblk = es->es_lblk; return 1; out_wrap: ei->i_es_shrink_lblk = 0; return 0; } static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; int nr_shrunk = 0; ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && start != 0) es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); ei->i_es_tree.cache_es = NULL; return nr_shrunk; } /* * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove * discretionary entries from the extent status cache. (Some entries * must be present for proper operations.) */ void ext4_clear_inode_es(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct extent_status *es; struct ext4_es_tree *tree; struct rb_node *node; write_lock(&ei->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; tree->cache_es = NULL; node = rb_first(&tree->root); while (node) { es = rb_entry(node, struct extent_status, rb_node); node = rb_next(node); if (!ext4_es_must_keep(es)) { rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); } } ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED); write_unlock(&ei->i_es_lock); } #ifdef ES_DEBUG__ static void ext4_print_pending_tree(struct inode *inode) { struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr; printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); tree = &EXT4_I(inode)->i_pending_tree; node = rb_first(&tree->root); while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); printk(KERN_DEBUG " %u", pr->lclu); node = rb_next(node); } printk(KERN_DEBUG "\n"); } #else #define ext4_print_pending_tree(inode) #endif int __init ext4_init_pending(void) { ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT); if (ext4_pending_cachep == NULL) return -ENOMEM; return 0; } void ext4_exit_pending(void) { kmem_cache_destroy(ext4_pending_cachep); } void ext4_init_pending_tree(struct ext4_pending_tree *tree) { tree->root = RB_ROOT; } /* * __get_pending - retrieve a pointer to a pending reservation * * @inode - file containing the pending cluster reservation * @lclu - logical cluster of interest * * Returns a pointer to a pending reservation if it's a member of * the set, and NULL if not. Must be called holding i_es_lock. */ static struct pending_reservation *__get_pending(struct inode *inode, ext4_lblk_t lclu) { struct ext4_pending_tree *tree; struct rb_node *node; struct pending_reservation *pr = NULL; tree = &EXT4_I(inode)->i_pending_tree; node = (&tree->root)->rb_node; while (node) { pr = rb_entry(node, struct pending_reservation, rb_node); if (lclu < pr->lclu) node = node->rb_left; else if (lclu > pr->lclu) node = node->rb_right; else if (lclu == pr->lclu) return pr; } return NULL; } /* * __insert_pending - adds a pending cluster reservation to the set of * pending reservations * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added * @prealloc - preallocated pending entry * * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. */ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct pending_reservation *pr; ext4_lblk_t lclu; int ret = 0; lclu = EXT4_B2C(sbi, lblk); /* search to find parent for insertion */ while (*p) { parent = *p; pr = rb_entry(parent, struct pending_reservation, rb_node); if (lclu < pr->lclu) { p = &(*p)->rb_left; } else if (lclu > pr->lclu) { p = &(*p)->rb_right; } else { /* pending reservation already inserted */ goto out; } } if (likely(*prealloc == NULL)) { pr = __alloc_pending(false); if (!pr) { ret = -ENOMEM; goto out; } } else { pr = *prealloc; *prealloc = NULL; } pr->lclu = lclu; rb_link_node(&pr->rb_node, parent, p); rb_insert_color(&pr->rb_node, &tree->root); ret = 1; out: return ret; } /* * __remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Returns successfully if pending reservation is not a member of the set. */ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct pending_reservation *pr; struct ext4_pending_tree *tree; pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); } } /* * ext4_remove_pending - removes a pending cluster reservation from the set * of pending reservations * * @inode - file containing the cluster * @lblk - logical block in the pending cluster reservation to be removed * * Locking for external use of __remove_pending. */ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_inode_info *ei = EXT4_I(inode); write_lock(&ei->i_es_lock); __remove_pending(inode, lblk); write_unlock(&ei->i_es_lock); } /* * ext4_is_pending - determine whether a cluster has a pending reservation * on it * * @inode - file containing the cluster * @lblk - logical block in the cluster * * Returns true if there's a pending reservation for the cluster in the * set of pending reservations, and false if not. */ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); bool ret; read_lock(&ei->i_es_lock); ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); read_unlock(&ei->i_es_lock); return ret; } /* * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents * status tree, adding a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * @lclu_allocated/end_allocated - indicates whether a physical cluster has * been allocated for the logical cluster * that contains the start/end block. Note that * end_allocated should always be set to false * if the start and the end block are in the * same cluster */ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; struct pending_reservation *pr1 = NULL; struct pending_reservation *pr2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) return; WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && end_allocated); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); ext4_es_insert_extent_check(inode, &newes); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); if (err1 || err2 || err3 < 0) { if (lclu_allocated && !pr1) pr1 = __alloc_pending(true); if (end_allocated && !pr2) pr2 = __alloc_pending(true); } write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es1) { if (!es1->es_len) __es_free_extent(es1); es1 = NULL; } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; /* Free preallocated extent if it didn't get used. */ if (es2) { if (!es2->es_len) __es_free_extent(es2); es2 = NULL; } if (lclu_allocated) { err3 = __insert_pending(inode, lblk, &pr1); if (err3 < 0) goto error; if (pr1) { __free_pending(pr1); pr1 = NULL; } } if (end_allocated) { err3 = __insert_pending(inode, end, &pr2); if (err3 < 0) goto error; if (pr2) { __free_pending(pr2); pr2 = NULL; } } ext4_es_inc_seq(inode); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2 || err3 < 0) goto retry; trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, end_allocated); ext4_es_print_tree(inode); ext4_print_pending_tree(inode); return; } /* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending * upon the presence or absence of delayed blocks * outside the range within clusters at the ends of the * range * * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten * status. Must be called while holding i_es_lock. Returns number of new * inserts pending cluster on insert pendings, returns 0 on remove pendings, * return -ENOMEM on failure. */ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; int pendings = 0; int ret = 0; if (len == 0) return 0; /* * Two cases - block range within single cluster and block range * spanning two or more clusters. Note that a cluster belonging * to a range starting and/or ending on a cluster boundary is treated * as if it does not contain a delayed extent. The new range may * have allocated space for previously delayed blocks out to the * cluster boundary, requiring that any pre-existing pending * reservation be canceled. Because this code only looks at blocks * outside the range, it should revise pending reservations * correctly even if the extent represented by the range can't be * inserted in the extents status tree due to ENOSPC. */ if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; pendings += ret; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, last); } } else { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; pendings += ret; } else __remove_pending(inode, last); } out: return (ret < 0) ? ret : pendings; } |
| 3 2 1 1 1 1 22 2 22 2 22 22 22 3 3 21 21 21 21 21 21 21 21 21 1 1 1 1 1 3 3 1 3 4 2 4 6 6 6 355 1698 355 1699 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netprio_cgroup.c Priority Control Group * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> #include <linux/sched/task.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/netprio_cgroup.h> #include <linux/fdtable.h> /* * netprio allocates per-net_device priomap array which is indexed by * css->id. Limiting css ID to 16bits doesn't lose anything. */ #define NETPRIO_ID_MAX USHRT_MAX #define PRIOMAP_MIN_SZ 128 /* * Extend @dev->priomap so that it's large enough to accommodate * @target_idx. @dev->priomap.priomap_len > @target_idx after successful * return. Must be called under rtnl lock. */ static int extend_netdev_table(struct net_device *dev, u32 target_idx) { struct netprio_map *old, *new; size_t new_sz, new_len; /* is the existing priomap large enough? */ old = rtnl_dereference(dev->priomap); if (old && old->priomap_len > target_idx) return 0; /* * Determine the new size. Let's keep it power-of-two. We start * from PRIOMAP_MIN_SZ and double it until it's large enough to * accommodate @target_idx. */ new_sz = PRIOMAP_MIN_SZ; while (true) { new_len = (new_sz - offsetof(struct netprio_map, priomap)) / sizeof(new->priomap[0]); if (new_len > target_idx) break; new_sz *= 2; /* overflowed? */ if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) return -ENOSPC; } /* allocate & copy */ new = kzalloc(new_sz, GFP_KERNEL); if (!new) return -ENOMEM; if (old) memcpy(new->priomap, old->priomap, old->priomap_len * sizeof(old->priomap[0])); new->priomap_len = new_len; /* install the new priomap */ rcu_assign_pointer(dev->priomap, new); if (old) kfree_rcu(old, rcu); return 0; } /** * netprio_prio - return the effective netprio of a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * * Should be called under RCU read or rtnl lock. */ static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; return 0; } /** * netprio_set_prio - set netprio on a cgroup-net_device pair * @css: css part of the target pair * @dev: net_device part of the target pair * @prio: prio to set * * Set netprio to @prio on @css-@dev pair. Should be called under rtnl * lock and may fail under memory pressure for non-zero @prio. */ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; int id = css->id; int ret; /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); if (!prio && (!map || map->priomap_len <= id)) return 0; ret = extend_netdev_table(dev, id); if (ret) return ret; map = rtnl_dereference(dev->priomap); map->priomap[id] = prio; return 0; } static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; css = kzalloc(sizeof(*css), GFP_KERNEL); if (!css) return ERR_PTR(-ENOMEM); return css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; struct net_device *dev; int ret = 0; if (css->id > NETPRIO_ID_MAX) return -ENOSPC; if (!parent_css) return 0; rtnl_lock(); /* * Inherit prios from the parent. As all prios are set during * onlining, there is no need to clear them on offline. */ for_each_netdev(&init_net, dev) { u32 prio = netprio_prio(parent_css, dev); ret = netprio_set_prio(css, dev, prio); if (ret) break; } rtnl_unlock(); return ret; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css); } static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { return css->id; } static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; int ret; if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); rtnl_unlock(); dev_put(dev); return ret ?: nbytes; } static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); return 0; } static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } static struct cftype ss_files[] = { { .name = "prioidx", .read_u64 = read_prioidx, }, { .name = "ifpriomap", .seq_show = read_priomap, .write = write_priomap, }, { } /* terminate */ }; struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, .legacy_cftypes = ss_files, }; static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* * Note this is called with rtnl_lock held so we have update side * protection on our rcu assignments */ switch (event) { case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); if (old) kfree_rcu(old, rcu); break; } return NOTIFY_DONE; } static struct notifier_block netprio_device_notifier = { .notifier_call = netprio_device_event }; static int __init init_cgroup_netprio(void) { register_netdevice_notifier(&netprio_device_notifier); return 0; } subsys_initcall(init_cgroup_netprio); |
| 53 53 50 50 120 108 27 27 27 4 27 4 27 27 22 27 1 55 54 54 12 55 53 53 53 48 53 49 49 53 53 49 49 49 49 4 3 3 3 4 22 21 1 2 19 1 21 21 23 1 7 4 1 1 4 17 17 16 17 17 124 2 49 27 475 87 86 343 95 56 1699 124 1695 475 1692 343 1701 400 160 56 70 15 7 17 8 3 115 17 56 56 23 17 15 56 15 15 15 56 56 97 16 15 14 15 10 10 9 9 10 10 6 1 1 1 1699 40 40 36 34 14 1 1 1 1 2 1 2 2 1 23 23 4 4 1 1 40 39 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED || real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; netdev_update_features(dev); return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static void vlan_vid0_add(struct net_device *dev) { struct vlan_info *vlan_info; int err; if (!(dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) return; pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); err = vlan_vid_add(dev, htons(ETH_P_8021Q), 0); if (err) return; vlan_info = rtnl_dereference(dev->vlan_info); vlan_info->auto_vid0 = true; } static void vlan_vid0_del(struct net_device *dev) { struct vlan_info *vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info || !vlan_info->auto_vid0) return; vlan_info->auto_vid0 = false; vlan_vid_del(dev, htons(ETH_P_8021Q), 0); } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if (event == NETDEV_UP) vlan_vid0_add(dev); else if (event == NETDEV_DOWN) vlan_vid0_del(dev); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } netif_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = netif_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_IMPORT_NS("NETDEV_INTERNAL"); |
| 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Nano River Technologies viperboard i2c controller driver * * (C) 2012 by Lemonage GmbH * Author: Lars Poeschel <poeschel@lemonage.de> * All rights reserved. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/platform_device.h> #include <linux/usb.h> #include <linux/i2c.h> #include <linux/mfd/viperboard.h> struct vprbrd_i2c { struct i2c_adapter i2c; u8 bus_freq_param; }; /* i2c bus frequency module parameter */ static u8 i2c_bus_param; static unsigned int i2c_bus_freq = 100; module_param(i2c_bus_freq, int, 0); MODULE_PARM_DESC(i2c_bus_freq, "i2c bus frequency in khz (default is 100) valid values: 10, 100, 200, 400, 1000, 3000, 6000"); static int vprbrd_i2c_status(struct i2c_adapter *i2c, struct vprbrd_i2c_status *status, bool prev_error) { u16 bytes_xfer; int ret; struct vprbrd *vb = (struct vprbrd *)i2c->algo_data; /* check for protocol error */ bytes_xfer = sizeof(struct vprbrd_i2c_status); ret = usb_control_msg(vb->usb_dev, usb_rcvctrlpipe(vb->usb_dev, 0), VPRBRD_USB_REQUEST_I2C, VPRBRD_USB_TYPE_IN, 0x0000, 0x0000, status, bytes_xfer, VPRBRD_USB_TIMEOUT_MS); if (ret != bytes_xfer) prev_error = true; if (prev_error) { dev_err(&i2c->dev, "failure in usb communication\n"); return -EREMOTEIO; } dev_dbg(&i2c->dev, " status = %d\n", status->status); if (status->status != 0x00) { dev_err(&i2c->dev, "failure: i2c protocol error\n"); return -EPROTO; } return 0; } static int vprbrd_i2c_receive(struct usb_device *usb_dev, struct vprbrd_i2c_read_msg *rmsg, int bytes_xfer) { int ret, bytes_actual; int error = 0; /* send the read request */ ret = usb_bulk_msg(usb_dev, usb_sndbulkpipe(usb_dev, VPRBRD_EP_OUT), rmsg, sizeof(struct vprbrd_i2c_read_hdr), &bytes_actual, VPRBRD_USB_TIMEOUT_MS); if ((ret < 0) || (bytes_actual != sizeof(struct vprbrd_i2c_read_hdr))) { dev_err(&usb_dev->dev, "failure transmitting usb\n"); error = -EREMOTEIO; } /* read the actual data */ ret = usb_bulk_msg(usb_dev, usb_rcvbulkpipe(usb_dev, VPRBRD_EP_IN), rmsg, bytes_xfer, &bytes_actual, VPRBRD_USB_TIMEOUT_MS); if ((ret < 0) || (bytes_xfer != bytes_actual)) { dev_err(&usb_dev->dev, "failure receiving usb\n"); error = -EREMOTEIO; } return error; } static int vprbrd_i2c_addr(struct usb_device *usb_dev, struct vprbrd_i2c_addr_msg *amsg) { int ret, bytes_actual; ret = usb_bulk_msg(usb_dev, usb_sndbulkpipe(usb_dev, VPRBRD_EP_OUT), amsg, sizeof(struct vprbrd_i2c_addr_msg), &bytes_actual, VPRBRD_USB_TIMEOUT_MS); if ((ret < 0) || (sizeof(struct vprbrd_i2c_addr_msg) != bytes_actual)) { dev_err(&usb_dev->dev, "failure transmitting usb\n"); return -EREMOTEIO; } return 0; } static int vprbrd_i2c_read(struct vprbrd *vb, struct i2c_msg *msg) { int ret; u16 remain_len, len1, len2, start = 0x0000; struct vprbrd_i2c_read_msg *rmsg = (struct vprbrd_i2c_read_msg *)vb->buf; remain_len = msg->len; rmsg->header.cmd = VPRBRD_I2C_CMD_READ; while (remain_len > 0) { rmsg->header.addr = cpu_to_le16(start + 0x4000); if (remain_len <= 255) { len1 = remain_len; len2 = 0x00; rmsg->header.len0 = remain_len; rmsg->header.len1 = 0x00; rmsg->header.len2 = 0x00; rmsg->header.len3 = 0x00; rmsg->header.len4 = 0x00; rmsg->header.len5 = 0x00; remain_len = 0; } else if (remain_len <= 510) { len1 = remain_len; len2 = 0x00; rmsg->header.len0 = remain_len - 255; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0x00; rmsg->header.len3 = 0x00; rmsg->header.len4 = 0x00; rmsg->header.len5 = 0x00; remain_len = 0; } else if (remain_len <= 512) { len1 = remain_len; len2 = 0x00; rmsg->header.len0 = remain_len - 510; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0xff; rmsg->header.len3 = 0x00; rmsg->header.len4 = 0x00; rmsg->header.len5 = 0x00; remain_len = 0; } else if (remain_len <= 767) { len1 = 512; len2 = remain_len - 512; rmsg->header.len0 = 0x02; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0xff; rmsg->header.len3 = remain_len - 512; rmsg->header.len4 = 0x00; rmsg->header.len5 = 0x00; remain_len = 0; } else if (remain_len <= 1022) { len1 = 512; len2 = remain_len - 512; rmsg->header.len0 = 0x02; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0xff; rmsg->header.len3 = remain_len - 767; rmsg->header.len4 = 0xff; rmsg->header.len5 = 0x00; remain_len = 0; } else if (remain_len <= 1024) { len1 = 512; len2 = remain_len - 512; rmsg->header.len0 = 0x02; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0xff; rmsg->header.len3 = remain_len - 1022; rmsg->header.len4 = 0xff; rmsg->header.len5 = 0xff; remain_len = 0; } else { len1 = 512; len2 = 512; rmsg->header.len0 = 0x02; rmsg->header.len1 = 0xff; rmsg->header.len2 = 0xff; rmsg->header.len3 = 0x02; rmsg->header.len4 = 0xff; rmsg->header.len5 = 0xff; remain_len -= 1024; start += 1024; } rmsg->header.tf1 = cpu_to_le16(len1); rmsg->header.tf2 = cpu_to_le16(len2); /* first read transfer */ ret = vprbrd_i2c_receive(vb->usb_dev, rmsg, len1); if (ret < 0) return ret; /* copy the received data */ memcpy(msg->buf + start, rmsg, len1); /* second read transfer if necessary */ if (len2 > 0) { ret = vprbrd_i2c_receive(vb->usb_dev, rmsg, len2); if (ret < 0) return ret; /* copy the received data */ memcpy(msg->buf + start + 512, rmsg, len2); } } return 0; } static int vprbrd_i2c_write(struct vprbrd *vb, struct i2c_msg *msg) { int ret, bytes_actual; u16 remain_len, bytes_xfer, start = 0x0000; struct vprbrd_i2c_write_msg *wmsg = (struct vprbrd_i2c_write_msg *)vb->buf; remain_len = msg->len; wmsg->header.cmd = VPRBRD_I2C_CMD_WRITE; wmsg->header.last = 0x00; wmsg->header.chan = 0x00; wmsg->header.spi = 0x0000; while (remain_len > 0) { wmsg->header.addr = cpu_to_le16(start + 0x4000); if (remain_len > 503) { wmsg->header.len1 = 0xff; wmsg->header.len2 = 0xf8; remain_len -= 503; bytes_xfer = 503 + sizeof(struct vprbrd_i2c_write_hdr); start += 503; } else if (remain_len > 255) { wmsg->header.len1 = 0xff; wmsg->header.len2 = (remain_len - 255); bytes_xfer = remain_len + sizeof(struct vprbrd_i2c_write_hdr); remain_len = 0; } else { wmsg->header.len1 = remain_len; wmsg->header.len2 = 0x00; bytes_xfer = remain_len + sizeof(struct vprbrd_i2c_write_hdr); remain_len = 0; } memcpy(wmsg->data, msg->buf + start, bytes_xfer - sizeof(struct vprbrd_i2c_write_hdr)); ret = usb_bulk_msg(vb->usb_dev, usb_sndbulkpipe(vb->usb_dev, VPRBRD_EP_OUT), wmsg, bytes_xfer, &bytes_actual, VPRBRD_USB_TIMEOUT_MS); if ((ret < 0) || (bytes_xfer != bytes_actual)) return -EREMOTEIO; } return 0; } static int vprbrd_i2c_xfer(struct i2c_adapter *i2c, struct i2c_msg *msgs, int num) { struct i2c_msg *pmsg; int i, ret, error = 0; struct vprbrd *vb = (struct vprbrd *)i2c->algo_data; struct vprbrd_i2c_addr_msg *amsg = (struct vprbrd_i2c_addr_msg *)vb->buf; struct vprbrd_i2c_status *smsg = (struct vprbrd_i2c_status *)vb->buf; for (i = 0 ; i < num ; i++) { pmsg = &msgs[i]; dev_dbg(&i2c->dev, " %d: %s (flags %d) %d bytes to 0x%02x\n", i, str_read_write(pmsg->flags & I2C_M_RD), pmsg->flags, pmsg->len, pmsg->addr); mutex_lock(&vb->lock); /* directly send the message */ if (pmsg->flags & I2C_M_RD) { /* read data */ amsg->cmd = VPRBRD_I2C_CMD_ADDR; amsg->unknown2 = 0x00; amsg->unknown3 = 0x00; amsg->addr = pmsg->addr; amsg->unknown1 = 0x01; amsg->len = cpu_to_le16(pmsg->len); /* send the addr and len, we're interested to board */ ret = vprbrd_i2c_addr(vb->usb_dev, amsg); if (ret < 0) error = ret; ret = vprbrd_i2c_read(vb, pmsg); if (ret < 0) error = ret; ret = vprbrd_i2c_status(i2c, smsg, error); if (ret < 0) error = ret; /* in case of protocol error, return the error */ if (error < 0) goto error; } else { /* write data */ ret = vprbrd_i2c_write(vb, pmsg); amsg->cmd = VPRBRD_I2C_CMD_ADDR; amsg->unknown2 = 0x00; amsg->unknown3 = 0x00; amsg->addr = pmsg->addr; amsg->unknown1 = 0x00; amsg->len = cpu_to_le16(pmsg->len); /* send the addr, the data goes to to board */ ret = vprbrd_i2c_addr(vb->usb_dev, amsg); if (ret < 0) error = ret; ret = vprbrd_i2c_status(i2c, smsg, error); if (ret < 0) error = ret; if (error < 0) goto error; } mutex_unlock(&vb->lock); } return num; error: mutex_unlock(&vb->lock); return error; } static u32 vprbrd_i2c_func(struct i2c_adapter *i2c) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; } /* This is the actual algorithm we define */ static const struct i2c_algorithm vprbrd_algorithm = { .xfer = vprbrd_i2c_xfer, .functionality = vprbrd_i2c_func, }; static const struct i2c_adapter_quirks vprbrd_quirks = { .max_read_len = 2048, .max_write_len = 2048, }; static int vprbrd_i2c_probe(struct platform_device *pdev) { struct vprbrd *vb = dev_get_drvdata(pdev->dev.parent); struct vprbrd_i2c *vb_i2c; int ret; int pipe; vb_i2c = devm_kzalloc(&pdev->dev, sizeof(*vb_i2c), GFP_KERNEL); if (vb_i2c == NULL) return -ENOMEM; /* setup i2c adapter description */ vb_i2c->i2c.owner = THIS_MODULE; vb_i2c->i2c.class = I2C_CLASS_HWMON; vb_i2c->i2c.algo = &vprbrd_algorithm; vb_i2c->i2c.quirks = &vprbrd_quirks; vb_i2c->i2c.algo_data = vb; /* save the param in usb capabable memory */ vb_i2c->bus_freq_param = i2c_bus_param; snprintf(vb_i2c->i2c.name, sizeof(vb_i2c->i2c.name), "viperboard at bus %03d device %03d", vb->usb_dev->bus->busnum, vb->usb_dev->devnum); /* setting the bus frequency */ if ((i2c_bus_param <= VPRBRD_I2C_FREQ_10KHZ) && (i2c_bus_param >= VPRBRD_I2C_FREQ_6MHZ)) { pipe = usb_sndctrlpipe(vb->usb_dev, 0); ret = usb_control_msg(vb->usb_dev, pipe, VPRBRD_USB_REQUEST_I2C_FREQ, VPRBRD_USB_TYPE_OUT, 0x0000, 0x0000, &vb_i2c->bus_freq_param, 1, VPRBRD_USB_TIMEOUT_MS); if (ret != 1) return dev_err_probe(&pdev->dev, -EIO, "failure setting i2c_bus_freq to %d\n", i2c_bus_freq); } else { return dev_err_probe(&pdev->dev, -EIO, "invalid i2c_bus_freq setting:%d\n", i2c_bus_freq); } vb_i2c->i2c.dev.parent = &pdev->dev; /* attach to i2c layer */ i2c_add_adapter(&vb_i2c->i2c); platform_set_drvdata(pdev, vb_i2c); return 0; } static void vprbrd_i2c_remove(struct platform_device *pdev) { struct vprbrd_i2c *vb_i2c = platform_get_drvdata(pdev); i2c_del_adapter(&vb_i2c->i2c); } static struct platform_driver vprbrd_i2c_driver = { .driver.name = "viperboard-i2c", .probe = vprbrd_i2c_probe, .remove = vprbrd_i2c_remove, }; static int __init vprbrd_i2c_init(void) { switch (i2c_bus_freq) { case 6000: i2c_bus_param = VPRBRD_I2C_FREQ_6MHZ; break; case 3000: i2c_bus_param = VPRBRD_I2C_FREQ_3MHZ; break; case 1000: i2c_bus_param = VPRBRD_I2C_FREQ_1MHZ; break; case 400: i2c_bus_param = VPRBRD_I2C_FREQ_400KHZ; break; case 200: i2c_bus_param = VPRBRD_I2C_FREQ_200KHZ; break; case 100: i2c_bus_param = VPRBRD_I2C_FREQ_100KHZ; break; case 10: i2c_bus_param = VPRBRD_I2C_FREQ_10KHZ; break; default: pr_warn("invalid i2c_bus_freq (%d)\n", i2c_bus_freq); i2c_bus_param = VPRBRD_I2C_FREQ_100KHZ; } return platform_driver_register(&vprbrd_i2c_driver); } subsys_initcall(vprbrd_i2c_init); static void __exit vprbrd_i2c_exit(void) { platform_driver_unregister(&vprbrd_i2c_driver); } module_exit(vprbrd_i2c_exit); MODULE_AUTHOR("Lars Poeschel <poeschel@lemonage.de>"); MODULE_DESCRIPTION("I2C controller driver for Nano River Techs Viperboard"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:viperboard-i2c"); |
| 2 3 2 3 2 3 3 2 3 3 3 3 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 | // SPDX-License-Identifier: GPL-2.0 /* Fintek F81604 USB-to-2CAN controller driver. * * Copyright (C) 2023 Ji-Ze Hong (Peter Hong) <peter_hong@fintek.com.tw> */ #include <linux/bitfield.h> #include <linux/netdevice.h> #include <linux/units.h> #include <linux/usb.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include <linux/can/platform/sja1000.h> #include <linux/unaligned.h> /* vendor and product id */ #define F81604_VENDOR_ID 0x2c42 #define F81604_PRODUCT_ID 0x1709 #define F81604_CAN_CLOCK (12 * MEGA) #define F81604_MAX_DEV 2 #define F81604_SET_DEVICE_RETRY 10 #define F81604_USB_TIMEOUT 2000 #define F81604_SET_GET_REGISTER 0xA0 #define F81604_PORT_OFFSET 0x1000 #define F81604_MAX_RX_URBS 4 #define F81604_CMD_DATA 0x00 #define F81604_DLC_LEN_MASK GENMASK(3, 0) #define F81604_DLC_EFF_BIT BIT(7) #define F81604_DLC_RTR_BIT BIT(6) #define F81604_SFF_SHIFT 5 #define F81604_EFF_SHIFT 3 #define F81604_BRP_MASK GENMASK(5, 0) #define F81604_SJW_MASK GENMASK(7, 6) #define F81604_SEG1_MASK GENMASK(3, 0) #define F81604_SEG2_MASK GENMASK(6, 4) #define F81604_CLEAR_ALC 0 #define F81604_CLEAR_ECC 1 #define F81604_CLEAR_OVERRUN 2 /* device setting */ #define F81604_CTRL_MODE_REG 0x80 #define F81604_TX_ONESHOT (0x03 << 3) #define F81604_TX_NORMAL (0x01 << 3) #define F81604_RX_AUTO_RELEASE_BUF BIT(1) #define F81604_INT_WHEN_CHANGE BIT(0) #define F81604_TERMINATOR_REG 0x105 #define F81604_CAN0_TERM BIT(2) #define F81604_CAN1_TERM BIT(3) #define F81604_TERMINATION_DISABLED CAN_TERMINATION_DISABLED #define F81604_TERMINATION_ENABLED 120 /* SJA1000 registers - manual section 6.4 (Pelican Mode) */ #define F81604_SJA1000_MOD 0x00 #define F81604_SJA1000_CMR 0x01 #define F81604_SJA1000_IR 0x03 #define F81604_SJA1000_IER 0x04 #define F81604_SJA1000_ALC 0x0B #define F81604_SJA1000_ECC 0x0C #define F81604_SJA1000_RXERR 0x0E #define F81604_SJA1000_TXERR 0x0F #define F81604_SJA1000_ACCC0 0x10 #define F81604_SJA1000_ACCM0 0x14 #define F81604_MAX_FILTER_CNT 4 /* Common registers - manual section 6.5 */ #define F81604_SJA1000_BTR0 0x06 #define F81604_SJA1000_BTR1 0x07 #define F81604_SJA1000_BTR1_SAMPLE_TRIPLE BIT(7) #define F81604_SJA1000_OCR 0x08 #define F81604_SJA1000_CDR 0x1F /* mode register */ #define F81604_SJA1000_MOD_RM 0x01 #define F81604_SJA1000_MOD_LOM 0x02 #define F81604_SJA1000_MOD_STM 0x04 /* commands */ #define F81604_SJA1000_CMD_CDO 0x08 /* interrupt sources */ #define F81604_SJA1000_IRQ_BEI 0x80 #define F81604_SJA1000_IRQ_ALI 0x40 #define F81604_SJA1000_IRQ_EPI 0x20 #define F81604_SJA1000_IRQ_DOI 0x08 #define F81604_SJA1000_IRQ_EI 0x04 #define F81604_SJA1000_IRQ_TI 0x02 #define F81604_SJA1000_IRQ_RI 0x01 #define F81604_SJA1000_IRQ_ALL 0xFF #define F81604_SJA1000_IRQ_OFF 0x00 /* status register content */ #define F81604_SJA1000_SR_BS 0x80 #define F81604_SJA1000_SR_ES 0x40 #define F81604_SJA1000_SR_TCS 0x08 /* ECC register */ #define F81604_SJA1000_ECC_SEG 0x1F #define F81604_SJA1000_ECC_DIR 0x20 #define F81604_SJA1000_ECC_BIT 0x00 #define F81604_SJA1000_ECC_FORM 0x40 #define F81604_SJA1000_ECC_STUFF 0x80 #define F81604_SJA1000_ECC_MASK 0xc0 /* ALC register */ #define F81604_SJA1000_ALC_MASK 0x1f /* table of devices that work with this driver */ static const struct usb_device_id f81604_table[] = { { USB_DEVICE(F81604_VENDOR_ID, F81604_PRODUCT_ID) }, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, f81604_table); static const struct ethtool_ops f81604_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static const u16 f81604_termination[] = { F81604_TERMINATION_DISABLED, F81604_TERMINATION_ENABLED }; struct f81604_priv { struct net_device *netdev[F81604_MAX_DEV]; }; struct f81604_port_priv { struct can_priv can; struct net_device *netdev; struct sk_buff *echo_skb; unsigned long clear_flags; struct work_struct clear_reg_work; struct usb_device *dev; struct usb_interface *intf; struct usb_anchor urbs_anchor; }; /* Interrupt endpoint data format: * Byte 0: Status register. * Byte 1: Interrupt register. * Byte 2: Interrupt enable register. * Byte 3: Arbitration lost capture(ALC) register. * Byte 4: Error code capture(ECC) register. * Byte 5: Error warning limit register. * Byte 6: RX error counter register. * Byte 7: TX error counter register. * Byte 8: Reserved. */ struct f81604_int_data { u8 sr; u8 isrc; u8 ier; u8 alc; u8 ecc; u8 ewlr; u8 rxerr; u8 txerr; u8 val; } __packed __aligned(4); struct f81604_sff { __be16 id; u8 data[CAN_MAX_DLEN]; } __packed __aligned(2); struct f81604_eff { __be32 id; u8 data[CAN_MAX_DLEN]; } __packed __aligned(2); struct f81604_can_frame { u8 cmd; /* According for F81604 DLC define: * bit 3~0: data length (0~8) * bit6: is RTR flag. * bit7: is EFF frame. */ u8 dlc; union { struct f81604_sff sff; struct f81604_eff eff; }; } __packed __aligned(2); static const u8 bulk_in_addr[F81604_MAX_DEV] = { 2, 4 }; static const u8 bulk_out_addr[F81604_MAX_DEV] = { 1, 3 }; static const u8 int_in_addr[F81604_MAX_DEV] = { 1, 3 }; static int f81604_write(struct usb_device *dev, u16 reg, u8 data) { int ret; ret = usb_control_msg_send(dev, 0, F81604_SET_GET_REGISTER, USB_TYPE_VENDOR | USB_DIR_OUT, 0, reg, &data, sizeof(data), F81604_USB_TIMEOUT, GFP_KERNEL); if (ret) dev_err(&dev->dev, "%s: reg: %x data: %x failed: %pe\n", __func__, reg, data, ERR_PTR(ret)); return ret; } static int f81604_read(struct usb_device *dev, u16 reg, u8 *data) { int ret; ret = usb_control_msg_recv(dev, 0, F81604_SET_GET_REGISTER, USB_TYPE_VENDOR | USB_DIR_IN, 0, reg, data, sizeof(*data), F81604_USB_TIMEOUT, GFP_KERNEL); if (ret < 0) dev_err(&dev->dev, "%s: reg: %x failed: %pe\n", __func__, reg, ERR_PTR(ret)); return ret; } static int f81604_update_bits(struct usb_device *dev, u16 reg, u8 mask, u8 data) { int ret; u8 tmp; ret = f81604_read(dev, reg, &tmp); if (ret) return ret; tmp &= ~mask; tmp |= (mask & data); return f81604_write(dev, reg, tmp); } static int f81604_sja1000_write(struct f81604_port_priv *priv, u16 reg, u8 data) { int port = priv->netdev->dev_port; int real_reg; real_reg = reg + F81604_PORT_OFFSET * port + F81604_PORT_OFFSET; return f81604_write(priv->dev, real_reg, data); } static int f81604_sja1000_read(struct f81604_port_priv *priv, u16 reg, u8 *data) { int port = priv->netdev->dev_port; int real_reg; real_reg = reg + F81604_PORT_OFFSET * port + F81604_PORT_OFFSET; return f81604_read(priv->dev, real_reg, data); } static int f81604_set_reset_mode(struct f81604_port_priv *priv) { int ret, i; u8 tmp; /* disable interrupts */ ret = f81604_sja1000_write(priv, F81604_SJA1000_IER, F81604_SJA1000_IRQ_OFF); if (ret) return ret; for (i = 0; i < F81604_SET_DEVICE_RETRY; i++) { ret = f81604_sja1000_read(priv, F81604_SJA1000_MOD, &tmp); if (ret) return ret; /* check reset bit */ if (tmp & F81604_SJA1000_MOD_RM) { priv->can.state = CAN_STATE_STOPPED; return 0; } /* reset chip */ ret = f81604_sja1000_write(priv, F81604_SJA1000_MOD, F81604_SJA1000_MOD_RM); if (ret) return ret; } return -EPERM; } static int f81604_set_normal_mode(struct f81604_port_priv *priv) { u8 tmp, ier = 0; u8 mod_reg = 0; int ret, i; for (i = 0; i < F81604_SET_DEVICE_RETRY; i++) { ret = f81604_sja1000_read(priv, F81604_SJA1000_MOD, &tmp); if (ret) return ret; /* check reset bit */ if ((tmp & F81604_SJA1000_MOD_RM) == 0) { priv->can.state = CAN_STATE_ERROR_ACTIVE; /* enable interrupts, RI handled by bulk-in */ ier = F81604_SJA1000_IRQ_ALL & ~F81604_SJA1000_IRQ_RI; if (!(priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING)) ier &= ~F81604_SJA1000_IRQ_BEI; return f81604_sja1000_write(priv, F81604_SJA1000_IER, ier); } /* set chip to normal mode */ if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) mod_reg |= F81604_SJA1000_MOD_LOM; if (priv->can.ctrlmode & CAN_CTRLMODE_PRESUME_ACK) mod_reg |= F81604_SJA1000_MOD_STM; ret = f81604_sja1000_write(priv, F81604_SJA1000_MOD, mod_reg); if (ret) return ret; } return -EPERM; } static int f81604_chipset_init(struct f81604_port_priv *priv) { int i, ret; /* set clock divider and output control register */ ret = f81604_sja1000_write(priv, F81604_SJA1000_CDR, CDR_CBP | CDR_PELICAN); if (ret) return ret; /* set acceptance filter (accept all) */ for (i = 0; i < F81604_MAX_FILTER_CNT; ++i) { ret = f81604_sja1000_write(priv, F81604_SJA1000_ACCC0 + i, 0); if (ret) return ret; } for (i = 0; i < F81604_MAX_FILTER_CNT; ++i) { ret = f81604_sja1000_write(priv, F81604_SJA1000_ACCM0 + i, 0xFF); if (ret) return ret; } return f81604_sja1000_write(priv, F81604_SJA1000_OCR, OCR_TX0_PUSHPULL | OCR_TX1_PUSHPULL | OCR_MODE_NORMAL); } static void f81604_process_rx_packet(struct net_device *netdev, struct f81604_can_frame *frame) { struct net_device_stats *stats = &netdev->stats; struct can_frame *cf; struct sk_buff *skb; if (frame->cmd != F81604_CMD_DATA) return; skb = alloc_can_skb(netdev, &cf); if (!skb) { stats->rx_dropped++; return; } cf->len = can_cc_dlc2len(frame->dlc & F81604_DLC_LEN_MASK); if (frame->dlc & F81604_DLC_EFF_BIT) { cf->can_id = get_unaligned_be32(&frame->eff.id) >> F81604_EFF_SHIFT; cf->can_id |= CAN_EFF_FLAG; if (!(frame->dlc & F81604_DLC_RTR_BIT)) memcpy(cf->data, frame->eff.data, cf->len); } else { cf->can_id = get_unaligned_be16(&frame->sff.id) >> F81604_SFF_SHIFT; if (!(frame->dlc & F81604_DLC_RTR_BIT)) memcpy(cf->data, frame->sff.data, cf->len); } if (frame->dlc & F81604_DLC_RTR_BIT) cf->can_id |= CAN_RTR_FLAG; else stats->rx_bytes += cf->len; stats->rx_packets++; netif_rx(skb); } static void f81604_read_bulk_callback(struct urb *urb) { struct f81604_can_frame *frame = urb->transfer_buffer; struct net_device *netdev = urb->context; int ret; if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "%s: URB aborted %pe\n", __func__, ERR_PTR(urb->status)); switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: goto resubmit_urb; } if (urb->actual_length != sizeof(*frame)) { netdev_warn(netdev, "URB length %u not equal to %zu\n", urb->actual_length, sizeof(*frame)); goto resubmit_urb; } f81604_process_rx_packet(netdev, frame); resubmit_urb: ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret == -ENODEV) netif_device_detach(netdev); else if (ret) netdev_err(netdev, "%s: failed to resubmit read bulk urb: %pe\n", __func__, ERR_PTR(ret)); } static void f81604_handle_tx(struct f81604_port_priv *priv, struct f81604_int_data *data) { struct net_device *netdev = priv->netdev; struct net_device_stats *stats = &netdev->stats; /* transmission buffer released */ if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT && !(data->sr & F81604_SJA1000_SR_TCS)) { stats->tx_errors++; can_free_echo_skb(netdev, 0, NULL); } else { /* transmission complete */ stats->tx_bytes += can_get_echo_skb(netdev, 0, NULL); stats->tx_packets++; } netif_wake_queue(netdev); } static void f81604_handle_can_bus_errors(struct f81604_port_priv *priv, struct f81604_int_data *data) { enum can_state can_state = priv->can.state; struct net_device *netdev = priv->netdev; struct net_device_stats *stats = &netdev->stats; struct can_frame *cf; struct sk_buff *skb; /* Note: ALC/ECC will not auto clear by read here, must be cleared by * read register (via clear_reg_work). */ skb = alloc_can_err_skb(netdev, &cf); if (skb) { cf->can_id |= CAN_ERR_CNT; cf->data[6] = data->txerr; cf->data[7] = data->rxerr; } if (data->isrc & F81604_SJA1000_IRQ_DOI) { /* data overrun interrupt */ netdev_dbg(netdev, "data overrun interrupt\n"); if (skb) { cf->can_id |= CAN_ERR_CRTL; cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; } stats->rx_over_errors++; stats->rx_errors++; set_bit(F81604_CLEAR_OVERRUN, &priv->clear_flags); } if (data->isrc & F81604_SJA1000_IRQ_EI) { /* error warning interrupt */ netdev_dbg(netdev, "error warning interrupt\n"); if (data->sr & F81604_SJA1000_SR_BS) can_state = CAN_STATE_BUS_OFF; else if (data->sr & F81604_SJA1000_SR_ES) can_state = CAN_STATE_ERROR_WARNING; else can_state = CAN_STATE_ERROR_ACTIVE; } if (data->isrc & F81604_SJA1000_IRQ_BEI) { /* bus error interrupt */ netdev_dbg(netdev, "bus error interrupt\n"); priv->can.can_stats.bus_error++; if (skb) { cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR; /* set error type */ switch (data->ecc & F81604_SJA1000_ECC_MASK) { case F81604_SJA1000_ECC_BIT: cf->data[2] |= CAN_ERR_PROT_BIT; break; case F81604_SJA1000_ECC_FORM: cf->data[2] |= CAN_ERR_PROT_FORM; break; case F81604_SJA1000_ECC_STUFF: cf->data[2] |= CAN_ERR_PROT_STUFF; break; default: break; } /* set error location */ cf->data[3] = data->ecc & F81604_SJA1000_ECC_SEG; } /* Error occurred during transmission? */ if ((data->ecc & F81604_SJA1000_ECC_DIR) == 0) { stats->tx_errors++; if (skb) cf->data[2] |= CAN_ERR_PROT_TX; } else { stats->rx_errors++; } set_bit(F81604_CLEAR_ECC, &priv->clear_flags); } if (data->isrc & F81604_SJA1000_IRQ_EPI) { if (can_state == CAN_STATE_ERROR_PASSIVE) can_state = CAN_STATE_ERROR_WARNING; else can_state = CAN_STATE_ERROR_PASSIVE; /* error passive interrupt */ netdev_dbg(netdev, "error passive interrupt: %d\n", can_state); } if (data->isrc & F81604_SJA1000_IRQ_ALI) { /* arbitration lost interrupt */ netdev_dbg(netdev, "arbitration lost interrupt\n"); priv->can.can_stats.arbitration_lost++; if (skb) { cf->can_id |= CAN_ERR_LOSTARB; cf->data[0] = data->alc & F81604_SJA1000_ALC_MASK; } set_bit(F81604_CLEAR_ALC, &priv->clear_flags); } if (can_state != priv->can.state) { enum can_state tx_state, rx_state; tx_state = data->txerr >= data->rxerr ? can_state : 0; rx_state = data->txerr <= data->rxerr ? can_state : 0; can_change_state(netdev, cf, tx_state, rx_state); if (can_state == CAN_STATE_BUS_OFF) can_bus_off(netdev); } if (priv->clear_flags) schedule_work(&priv->clear_reg_work); if (skb) netif_rx(skb); } static void f81604_read_int_callback(struct urb *urb) { struct f81604_int_data *data = urb->transfer_buffer; struct net_device *netdev = urb->context; struct f81604_port_priv *priv; int ret; priv = netdev_priv(netdev); if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "%s: Int URB aborted: %pe\n", __func__, ERR_PTR(urb->status)); switch (urb->status) { case 0: /* success */ break; case -ENOENT: case -EPIPE: case -EPROTO: case -ESHUTDOWN: return; default: goto resubmit_urb; } /* handle Errors */ if (data->isrc & (F81604_SJA1000_IRQ_DOI | F81604_SJA1000_IRQ_EI | F81604_SJA1000_IRQ_BEI | F81604_SJA1000_IRQ_EPI | F81604_SJA1000_IRQ_ALI)) f81604_handle_can_bus_errors(priv, data); /* handle TX */ if (priv->can.state != CAN_STATE_BUS_OFF && (data->isrc & F81604_SJA1000_IRQ_TI)) f81604_handle_tx(priv, data); resubmit_urb: ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret == -ENODEV) netif_device_detach(netdev); else if (ret) netdev_err(netdev, "%s: failed to resubmit int urb: %pe\n", __func__, ERR_PTR(ret)); } static void f81604_unregister_urbs(struct f81604_port_priv *priv) { usb_kill_anchored_urbs(&priv->urbs_anchor); } static int f81604_register_urbs(struct f81604_port_priv *priv) { struct net_device *netdev = priv->netdev; struct f81604_int_data *int_data; int id = netdev->dev_port; struct urb *int_urb; int rx_urb_cnt; int ret; for (rx_urb_cnt = 0; rx_urb_cnt < F81604_MAX_RX_URBS; ++rx_urb_cnt) { struct f81604_can_frame *frame; struct urb *rx_urb; rx_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rx_urb) { ret = -ENOMEM; break; } frame = kmalloc(sizeof(*frame), GFP_KERNEL); if (!frame) { usb_free_urb(rx_urb); ret = -ENOMEM; break; } usb_fill_bulk_urb(rx_urb, priv->dev, usb_rcvbulkpipe(priv->dev, bulk_in_addr[id]), frame, sizeof(*frame), f81604_read_bulk_callback, netdev); rx_urb->transfer_flags |= URB_FREE_BUFFER; usb_anchor_urb(rx_urb, &priv->urbs_anchor); ret = usb_submit_urb(rx_urb, GFP_KERNEL); if (ret) { usb_unanchor_urb(rx_urb); usb_free_urb(rx_urb); break; } /* Drop reference, USB core will take care of freeing it */ usb_free_urb(rx_urb); } if (rx_urb_cnt == 0) { netdev_warn(netdev, "%s: submit rx urb failed: %pe\n", __func__, ERR_PTR(ret)); goto error; } int_urb = usb_alloc_urb(0, GFP_KERNEL); if (!int_urb) { ret = -ENOMEM; goto error; } int_data = kmalloc(sizeof(*int_data), GFP_KERNEL); if (!int_data) { usb_free_urb(int_urb); ret = -ENOMEM; goto error; } usb_fill_int_urb(int_urb, priv->dev, usb_rcvintpipe(priv->dev, int_in_addr[id]), int_data, sizeof(*int_data), f81604_read_int_callback, netdev, 1); int_urb->transfer_flags |= URB_FREE_BUFFER; usb_anchor_urb(int_urb, &priv->urbs_anchor); ret = usb_submit_urb(int_urb, GFP_KERNEL); if (ret) { usb_unanchor_urb(int_urb); usb_free_urb(int_urb); netdev_warn(netdev, "%s: submit int urb failed: %pe\n", __func__, ERR_PTR(ret)); goto error; } /* Drop reference, USB core will take care of freeing it */ usb_free_urb(int_urb); return 0; error: f81604_unregister_urbs(priv); return ret; } static int f81604_start(struct net_device *netdev) { struct f81604_port_priv *priv = netdev_priv(netdev); int ret; u8 mode; u8 tmp; mode = F81604_RX_AUTO_RELEASE_BUF | F81604_INT_WHEN_CHANGE; /* Set TR/AT mode */ if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) mode |= F81604_TX_ONESHOT; else mode |= F81604_TX_NORMAL; ret = f81604_sja1000_write(priv, F81604_CTRL_MODE_REG, mode); if (ret) return ret; /* set reset mode */ ret = f81604_set_reset_mode(priv); if (ret) return ret; ret = f81604_chipset_init(priv); if (ret) return ret; /* Clear error counters and error code capture */ ret = f81604_sja1000_write(priv, F81604_SJA1000_TXERR, 0); if (ret) return ret; ret = f81604_sja1000_write(priv, F81604_SJA1000_RXERR, 0); if (ret) return ret; /* Read clear for ECC/ALC/IR register */ ret = f81604_sja1000_read(priv, F81604_SJA1000_ECC, &tmp); if (ret) return ret; ret = f81604_sja1000_read(priv, F81604_SJA1000_ALC, &tmp); if (ret) return ret; ret = f81604_sja1000_read(priv, F81604_SJA1000_IR, &tmp); if (ret) return ret; ret = f81604_register_urbs(priv); if (ret) return ret; ret = f81604_set_normal_mode(priv); if (ret) { f81604_unregister_urbs(priv); return ret; } return 0; } static int f81604_set_bittiming(struct net_device *dev) { struct f81604_port_priv *priv = netdev_priv(dev); struct can_bittiming *bt = &priv->can.bittiming; u8 btr0, btr1; int ret; btr0 = FIELD_PREP(F81604_BRP_MASK, bt->brp - 1) | FIELD_PREP(F81604_SJW_MASK, bt->sjw - 1); btr1 = FIELD_PREP(F81604_SEG1_MASK, bt->prop_seg + bt->phase_seg1 - 1) | FIELD_PREP(F81604_SEG2_MASK, bt->phase_seg2 - 1); if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) btr1 |= F81604_SJA1000_BTR1_SAMPLE_TRIPLE; ret = f81604_sja1000_write(priv, F81604_SJA1000_BTR0, btr0); if (ret) { netdev_warn(dev, "%s: Set BTR0 failed: %pe\n", __func__, ERR_PTR(ret)); return ret; } ret = f81604_sja1000_write(priv, F81604_SJA1000_BTR1, btr1); if (ret) { netdev_warn(dev, "%s: Set BTR1 failed: %pe\n", __func__, ERR_PTR(ret)); return ret; } return 0; } static int f81604_set_mode(struct net_device *netdev, enum can_mode mode) { int ret; switch (mode) { case CAN_MODE_START: ret = f81604_start(netdev); if (!ret && netif_queue_stopped(netdev)) netif_wake_queue(netdev); break; default: ret = -EOPNOTSUPP; } return ret; } static void f81604_write_bulk_callback(struct urb *urb) { struct net_device *netdev = urb->context; if (!netif_device_present(netdev)) return; if (urb->status) netdev_info(netdev, "%s: Tx URB error: %pe\n", __func__, ERR_PTR(urb->status)); } static void f81604_clear_reg_work(struct work_struct *work) { struct f81604_port_priv *priv; u8 tmp; priv = container_of(work, struct f81604_port_priv, clear_reg_work); /* dummy read for clear Arbitration lost capture(ALC) register. */ if (test_and_clear_bit(F81604_CLEAR_ALC, &priv->clear_flags)) f81604_sja1000_read(priv, F81604_SJA1000_ALC, &tmp); /* dummy read for clear Error code capture(ECC) register. */ if (test_and_clear_bit(F81604_CLEAR_ECC, &priv->clear_flags)) f81604_sja1000_read(priv, F81604_SJA1000_ECC, &tmp); /* dummy write for clear data overrun flag. */ if (test_and_clear_bit(F81604_CLEAR_OVERRUN, &priv->clear_flags)) f81604_sja1000_write(priv, F81604_SJA1000_CMR, F81604_SJA1000_CMD_CDO); } static netdev_tx_t f81604_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct can_frame *cf = (struct can_frame *)skb->data; struct f81604_port_priv *priv = netdev_priv(netdev); struct net_device_stats *stats = &netdev->stats; struct f81604_can_frame *frame; struct urb *write_urb; int ret; if (can_dev_dropped_skb(netdev, skb)) return NETDEV_TX_OK; netif_stop_queue(netdev); write_urb = usb_alloc_urb(0, GFP_ATOMIC); if (!write_urb) goto nomem_urb; frame = kzalloc(sizeof(*frame), GFP_ATOMIC); if (!frame) goto nomem_buf; usb_fill_bulk_urb(write_urb, priv->dev, usb_sndbulkpipe(priv->dev, bulk_out_addr[netdev->dev_port]), frame, sizeof(*frame), f81604_write_bulk_callback, priv->netdev); write_urb->transfer_flags |= URB_FREE_BUFFER; frame->cmd = F81604_CMD_DATA; frame->dlc = cf->len; if (cf->can_id & CAN_RTR_FLAG) frame->dlc |= F81604_DLC_RTR_BIT; if (cf->can_id & CAN_EFF_FLAG) { u32 id = (cf->can_id & CAN_EFF_MASK) << F81604_EFF_SHIFT; put_unaligned_be32(id, &frame->eff.id); frame->dlc |= F81604_DLC_EFF_BIT; if (!(cf->can_id & CAN_RTR_FLAG)) memcpy(&frame->eff.data, cf->data, cf->len); } else { u32 id = (cf->can_id & CAN_SFF_MASK) << F81604_SFF_SHIFT; put_unaligned_be16(id, &frame->sff.id); if (!(cf->can_id & CAN_RTR_FLAG)) memcpy(&frame->sff.data, cf->data, cf->len); } can_put_echo_skb(skb, netdev, 0, 0); ret = usb_submit_urb(write_urb, GFP_ATOMIC); if (ret) { netdev_err(netdev, "%s: failed to resubmit tx bulk urb: %pe\n", __func__, ERR_PTR(ret)); can_free_echo_skb(netdev, 0, NULL); stats->tx_dropped++; stats->tx_errors++; if (ret == -ENODEV) netif_device_detach(netdev); else netif_wake_queue(netdev); } /* let usb core take care of this urb */ usb_free_urb(write_urb); return NETDEV_TX_OK; nomem_buf: usb_free_urb(write_urb); nomem_urb: dev_kfree_skb(skb); stats->tx_dropped++; stats->tx_errors++; netif_wake_queue(netdev); return NETDEV_TX_OK; } static int f81604_get_berr_counter(const struct net_device *netdev, struct can_berr_counter *bec) { struct f81604_port_priv *priv = netdev_priv(netdev); u8 txerr, rxerr; int ret; ret = f81604_sja1000_read(priv, F81604_SJA1000_TXERR, &txerr); if (ret) return ret; ret = f81604_sja1000_read(priv, F81604_SJA1000_RXERR, &rxerr); if (ret) return ret; bec->txerr = txerr; bec->rxerr = rxerr; return 0; } /* Open USB device */ static int f81604_open(struct net_device *netdev) { int ret; ret = open_candev(netdev); if (ret) return ret; ret = f81604_start(netdev); if (ret) { if (ret == -ENODEV) netif_device_detach(netdev); close_candev(netdev); return ret; } netif_start_queue(netdev); return 0; } /* Close USB device */ static int f81604_close(struct net_device *netdev) { struct f81604_port_priv *priv = netdev_priv(netdev); f81604_set_reset_mode(priv); netif_stop_queue(netdev); cancel_work_sync(&priv->clear_reg_work); close_candev(netdev); f81604_unregister_urbs(priv); return 0; } static const struct net_device_ops f81604_netdev_ops = { .ndo_open = f81604_open, .ndo_stop = f81604_close, .ndo_start_xmit = f81604_start_xmit, }; static const struct can_bittiming_const f81604_bittiming_const = { .name = KBUILD_MODNAME, .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 64, .brp_inc = 1, }; /* Called by the usb core when driver is unloaded or device is removed */ static void f81604_disconnect(struct usb_interface *intf) { struct f81604_priv *priv = usb_get_intfdata(intf); int i; for (i = 0; i < ARRAY_SIZE(priv->netdev); ++i) { if (!priv->netdev[i]) continue; unregister_netdev(priv->netdev[i]); free_candev(priv->netdev[i]); } } static int __f81604_set_termination(struct usb_device *dev, int idx, u16 term) { u8 mask, data = 0; if (idx == 0) mask = F81604_CAN0_TERM; else mask = F81604_CAN1_TERM; if (term) data = mask; return f81604_update_bits(dev, F81604_TERMINATOR_REG, mask, data); } static int f81604_set_termination(struct net_device *netdev, u16 term) { struct f81604_port_priv *port_priv = netdev_priv(netdev); ASSERT_RTNL(); return __f81604_set_termination(port_priv->dev, netdev->dev_port, term); } static int f81604_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct net_device *netdev; struct f81604_priv *priv; int i, ret; priv = devm_kzalloc(&intf->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; usb_set_intfdata(intf, priv); for (i = 0; i < ARRAY_SIZE(priv->netdev); ++i) { ret = __f81604_set_termination(dev, i, 0); if (ret) { dev_err(&intf->dev, "Setting termination of CH#%d failed: %pe\n", i, ERR_PTR(ret)); return ret; } } for (i = 0; i < ARRAY_SIZE(priv->netdev); ++i) { struct f81604_port_priv *port_priv; netdev = alloc_candev(sizeof(*port_priv), 1); if (!netdev) { dev_err(&intf->dev, "Couldn't alloc candev: %d\n", i); ret = -ENOMEM; goto failure_cleanup; } port_priv = netdev_priv(netdev); INIT_WORK(&port_priv->clear_reg_work, f81604_clear_reg_work); init_usb_anchor(&port_priv->urbs_anchor); port_priv->intf = intf; port_priv->dev = dev; port_priv->netdev = netdev; port_priv->can.clock.freq = F81604_CAN_CLOCK; port_priv->can.termination_const = f81604_termination; port_priv->can.termination_const_cnt = ARRAY_SIZE(f81604_termination); port_priv->can.bittiming_const = &f81604_bittiming_const; port_priv->can.do_set_bittiming = f81604_set_bittiming; port_priv->can.do_set_mode = f81604_set_mode; port_priv->can.do_set_termination = f81604_set_termination; port_priv->can.do_get_berr_counter = f81604_get_berr_counter; port_priv->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_ONE_SHOT | CAN_CTRLMODE_BERR_REPORTING | CAN_CTRLMODE_PRESUME_ACK; netdev->ethtool_ops = &f81604_ethtool_ops; netdev->netdev_ops = &f81604_netdev_ops; netdev->flags |= IFF_ECHO; netdev->dev_port = i; SET_NETDEV_DEV(netdev, &intf->dev); ret = register_candev(netdev); if (ret) { netdev_err(netdev, "register CAN device failed: %pe\n", ERR_PTR(ret)); free_candev(netdev); goto failure_cleanup; } priv->netdev[i] = netdev; } return 0; failure_cleanup: f81604_disconnect(intf); return ret; } static struct usb_driver f81604_driver = { .name = KBUILD_MODNAME, .probe = f81604_probe, .disconnect = f81604_disconnect, .id_table = f81604_table, }; module_usb_driver(f81604_driver); MODULE_AUTHOR("Ji-Ze Hong (Peter Hong) <peter_hong@fintek.com.tw>"); MODULE_DESCRIPTION("Fintek F81604 USB to 2xCANBUS"); MODULE_LICENSE("GPL"); |
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | /* * net/tipc/ib_media.c: Infiniband bearer support for TIPC * * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> * * Based on eth_media.c, which carries the following copyright notice: * * Copyright (c) 2001-2007, Ericsson AB * Copyright (c) 2005-2008, 2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/if_infiniband.h> #include "core.h" #include "bearer.h" #define TIPC_MAX_IB_LINK_WIN 500 /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) { if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; sprintf(str_buf, "%20phC", a->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); addr->media_id = TIPC_MEDIA_TYPE_IB; addr->broadcast = !memcmp(msg, b->bcast_addr.value, INFINIBAND_ALEN); return 0; } /* Convert discovery msg addr format to InfiniBand media addr format */ static int tipc_ib_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { return tipc_ib_raw2addr(b, addr, msg); } /* InfiniBand media registration info */ struct tipc_media ib_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_ib_addr2str, .addr2msg = tipc_ib_addr2msg, .msg2addr = tipc_ib_msg2addr, .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" }; |
| 16 1686 1687 1688 1686 13 5 9 9 9 14 14 14 14 2 14 14 14 14 14 14 14 14 11 11 11 14 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 | // SPDX-License-Identifier: GPL-2.0 /* * usb port device code * * Copyright (C) 2012 Intel Corp * * Author: Lan Tianyu <tianyu.lan@intel.com> */ #include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/sysfs.h> #include <linux/pm_qos.h> #include <linux/component.h> #include <linux/usb/of.h> #include "hub.h" static int usb_port_block_power_off; static const struct attribute_group *port_dev_group[]; static ssize_t early_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%s\n", str_yes_no(port_dev->early_stop)); } static ssize_t early_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); bool value; if (kstrtobool(buf, &value)) return -EINVAL; if (value) port_dev->early_stop = 1; else port_dev->early_stop = 0; return count; } static DEVICE_ATTR_RW(early_stop); static ssize_t disable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(dev->parent); int port1 = port_dev->portnum; u16 portstatus, unused; bool disabled; int rc; struct kernfs_node *kn; if (!hub) return -ENODEV; hub_get(hub); rc = usb_autopm_get_interface(intf); if (rc < 0) goto out_hub_get; /* * Prevent deadlock if another process is concurrently * trying to unregister hdev. */ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); if (!kn) { rc = -ENODEV; goto out_autopm; } usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } usb_hub_port_status(hub, port1, &portstatus, &unused); disabled = !usb_port_is_power_on(hub, portstatus); out_hdev_lock: usb_unlock_device(hdev); sysfs_unbreak_active_protection(kn); out_autopm: usb_autopm_put_interface(intf); out_hub_get: hub_put(hub); if (rc) return rc; return sysfs_emit(buf, "%s\n", disabled ? "1" : "0"); } static ssize_t disable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(dev->parent); int port1 = port_dev->portnum; bool disabled; int rc; struct kernfs_node *kn; if (!hub) return -ENODEV; rc = kstrtobool(buf, &disabled); if (rc) return rc; hub_get(hub); rc = usb_autopm_get_interface(intf); if (rc < 0) goto out_hub_get; /* * Prevent deadlock if another process is concurrently * trying to unregister hdev. */ kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); if (!kn) { rc = -ENODEV; goto out_autopm; } usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } if (disabled && port_dev->child) usb_disconnect(&port_dev->child); rc = usb_hub_set_port_power(hdev, hub, port1, !disabled); if (disabled) { usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); } if (!rc) rc = count; out_hdev_lock: usb_unlock_device(hdev); sysfs_unbreak_active_protection(kn); out_autopm: usb_autopm_put_interface(intf); out_hub_get: hub_put(hub); return rc; } static DEVICE_ATTR_RW(disable); static ssize_t location_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "0x%08x\n", port_dev->location); } static DEVICE_ATTR_RO(location); static ssize_t connect_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); char *result; switch (port_dev->connect_type) { case USB_PORT_CONNECT_TYPE_HOT_PLUG: result = "hotplug"; break; case USB_PORT_CONNECT_TYPE_HARD_WIRED: result = "hardwired"; break; case USB_PORT_NOT_USED: result = "not used"; break; default: result = "unknown"; break; } return sysfs_emit(buf, "%s\n", result); } static DEVICE_ATTR_RO(connect_type); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); enum usb_device_state state = READ_ONCE(port_dev->state); return sysfs_emit(buf, "%s\n", usb_state_string(state)); } static DEVICE_ATTR_RO(state); static ssize_t over_current_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%u\n", port_dev->over_current_count); } static DEVICE_ATTR_RO(over_current_count); static ssize_t quirks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%08x\n", port_dev->quirks); } static ssize_t quirks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); u32 value; if (kstrtou32(buf, 16, &value)) return -EINVAL; port_dev->quirks = value; return count; } static DEVICE_ATTR_RW(quirks); static ssize_t usb3_lpm_permit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); const char *p; if (port_dev->usb3_lpm_u1_permit) { if (port_dev->usb3_lpm_u2_permit) p = "u1_u2"; else p = "u1"; } else { if (port_dev->usb3_lpm_u2_permit) p = "u2"; else p = "0"; } return sysfs_emit(buf, "%s\n", p); } static ssize_t usb3_lpm_permit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *udev = port_dev->child; struct usb_hcd *hcd; if (!strncmp(buf, "u1_u2", 5)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "u1", 2)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 0; } else if (!strncmp(buf, "u2", 2)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "0", 1)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 0; } else return -EINVAL; /* If device is connected to the port, disable or enable lpm * to make new u1 u2 setting take effect immediately. */ if (udev) { hcd = bus_to_hcd(udev->bus); if (!hcd) return -EINVAL; usb_lock_device(udev); mutex_lock(hcd->bandwidth_mutex); if (!usb_disable_lpm(udev)) usb_enable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); usb_unlock_device(udev); } return count; } static DEVICE_ATTR_RW(usb3_lpm_permit); static struct attribute *port_dev_attrs[] = { &dev_attr_connect_type.attr, &dev_attr_state.attr, &dev_attr_location.attr, &dev_attr_quirks.attr, &dev_attr_over_current_count.attr, &dev_attr_disable.attr, &dev_attr_early_stop.attr, NULL, }; static const struct attribute_group port_dev_attr_grp = { .attrs = port_dev_attrs, }; static const struct attribute_group *port_dev_group[] = { &port_dev_attr_grp, NULL, }; static struct attribute *port_dev_usb3_attrs[] = { &dev_attr_usb3_lpm_permit.attr, NULL, }; static const struct attribute_group port_dev_usb3_attr_grp = { .attrs = port_dev_usb3_attrs, }; static const struct attribute_group *port_dev_usb3_group[] = { &port_dev_attr_grp, &port_dev_usb3_attr_grp, NULL, }; static void usb_port_device_release(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); kfree(port_dev->req); kfree(port_dev); } #ifdef CONFIG_PM static int usb_port_runtime_resume(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_device *udev = port_dev->child; struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) { set_bit(port1, hub->power_bits); return 0; } /* * Power on our usb3 peer before this usb2 port to prevent a usb3 * device from degrading to its usb2 connection */ if (!port_dev->is_superspeed && peer) pm_runtime_get_sync(&peer->dev); retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, true); msleep(hub_power_on_good_delay(hub)); if (udev && !retval) { /* * Our preference is to simply wait for the port to reconnect, * as that is the lowest latency method to restart the port. * However, there are cases where toggling port power results in * the host port and the device port getting out of sync causing * a link training live lock. Upon timeout, flag the port as * needing warm reset recovery (to be performed later by * usb_port_resume() as requested via usb_wakeup_notification()) */ if (hub_port_debounce_be_connected(hub, port1) < 0) { dev_dbg(&port_dev->dev, "reconnect timeout\n"); if (hub_is_superspeed(hdev)) set_bit(port1, hub->warm_reset_bits); } /* Force the child awake to revalidate after the power loss. */ if (!test_and_set_bit(port1, hub->child_usage_bits)) { pm_runtime_get_noresume(&port_dev->dev); pm_request_resume(&udev->dev); } } usb_autopm_put_interface(intf); return retval; } static int usb_port_runtime_suspend(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) return -EBUSY; if (dev_pm_qos_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF) == PM_QOS_FLAGS_ALL) return -EAGAIN; if (usb_port_block_power_off) return -EBUSY; retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, false); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); usb_autopm_put_interface(intf); /* * Our peer usb3 port may now be able to suspend, so * asynchronously queue a suspend request to observe that this * usb2 port is now off. */ if (!port_dev->is_superspeed && peer) pm_runtime_put(&peer->dev); return retval; } #endif static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *udev = port_dev->child; if (udev && !udev->port_is_suspended) { usb_disable_usb2_hardware_lpm(udev); usb_unlocked_disable_lpm(udev); } } static const struct dev_pm_ops usb_port_pm_ops = { #ifdef CONFIG_PM .runtime_suspend = usb_port_runtime_suspend, .runtime_resume = usb_port_runtime_resume, #endif }; const struct device_type usb_port_device_type = { .name = "usb_port", .release = usb_port_device_release, .pm = &usb_port_pm_ops, }; static struct device_driver usb_port_driver = { .name = "usb", .owner = THIS_MODULE, .shutdown = usb_port_shutdown, }; static int link_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; int rc; if (left->peer == right && right->peer == left) return 0; if (left->peer || right->peer) { struct usb_port *lpeer = left->peer; struct usb_port *rpeer = right->peer; char *method; if (left->location && left->location == right->location) method = "location"; else method = "default"; pr_debug("usb: failed to peer %s and %s by %s (%s:%s) (%s:%s)\n", dev_name(&left->dev), dev_name(&right->dev), method, dev_name(&left->dev), lpeer ? dev_name(&lpeer->dev) : "none", dev_name(&right->dev), rpeer ? dev_name(&rpeer->dev) : "none"); return -EBUSY; } rc = sysfs_create_link(&left->dev.kobj, &right->dev.kobj, "peer"); if (rc) return rc; rc = sysfs_create_link(&right->dev.kobj, &left->dev.kobj, "peer"); if (rc) { sysfs_remove_link(&left->dev.kobj, "peer"); return rc; } /* * We need to wake the HiSpeed port to make sure we don't race * setting ->peer with usb_port_runtime_suspend(). Otherwise we * may miss a suspend event for the SuperSpeed port. */ if (left->is_superspeed) { ss_port = left; WARN_ON(right->is_superspeed); hs_port = right; } else { ss_port = right; WARN_ON(!right->is_superspeed); hs_port = left; } pm_runtime_get_sync(&hs_port->dev); left->peer = right; right->peer = left; /* * The SuperSpeed reference is dropped when the HiSpeed port in * this relationship suspends, i.e. when it is safe to allow a * SuperSpeed connection to drop since there is no risk of a * device degrading to its powered-off HiSpeed connection. * * Also, drop the HiSpeed ref taken above. */ pm_runtime_get_sync(&ss_port->dev); pm_runtime_put(&hs_port->dev); return 0; } static void link_peers_report(struct usb_port *left, struct usb_port *right) { int rc; rc = link_peers(left, right); if (rc == 0) { dev_dbg(&left->dev, "peered to %s\n", dev_name(&right->dev)); } else { dev_dbg(&left->dev, "failed to peer to %s (%d)\n", dev_name(&right->dev), rc); pr_warn_once("usb: port power management may be unreliable\n"); usb_port_block_power_off = 1; } } static void unlink_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; WARN(right->peer != left || left->peer != right, "%s and %s are not peers?\n", dev_name(&left->dev), dev_name(&right->dev)); /* * We wake the HiSpeed port to make sure we don't race its * usb_port_runtime_resume() event which takes a SuperSpeed ref * when ->peer is !NULL. */ if (left->is_superspeed) { ss_port = left; hs_port = right; } else { ss_port = right; hs_port = left; } pm_runtime_get_sync(&hs_port->dev); sysfs_remove_link(&left->dev.kobj, "peer"); right->peer = NULL; sysfs_remove_link(&right->dev.kobj, "peer"); left->peer = NULL; /* Drop the SuperSpeed ref held on behalf of the active HiSpeed port */ pm_runtime_put(&ss_port->dev); /* Drop the ref taken above */ pm_runtime_put(&hs_port->dev); } /* * For each usb hub device in the system check to see if it is in the * peer domain of the given port_dev, and if it is check to see if it * has a port that matches the given port by location */ static int match_location(struct usb_device *peer_hdev, void *p) { int port1; struct usb_hcd *hcd, *peer_hcd; struct usb_port *port_dev = p, *peer; struct usb_hub *peer_hub = usb_hub_to_struct_hub(peer_hdev); struct usb_device *hdev = to_usb_device(port_dev->dev.parent->parent); if (!peer_hub || port_dev->connect_type == USB_PORT_NOT_USED) return 0; hcd = bus_to_hcd(hdev->bus); peer_hcd = bus_to_hcd(peer_hdev->bus); /* peer_hcd is provisional until we verify it against the known peer */ if (peer_hcd != hcd->shared_hcd) return 0; for (port1 = 1; port1 <= peer_hdev->maxchild; port1++) { peer = peer_hub->ports[port1 - 1]; if (peer && peer->connect_type != USB_PORT_NOT_USED && peer->location == port_dev->location) { link_peers_report(port_dev, peer); return 1; /* done */ } } return 0; } /* * Find the peer port either via explicit platform firmware "location" * data, the peer hcd for root hubs, or the upstream peer relationship * for all other hubs. */ static void find_and_link_peer(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1], *peer; struct usb_device *hdev = hub->hdev; struct usb_device *peer_hdev; struct usb_hub *peer_hub; /* * If location data is available then we can only peer this port * by a location match, not the default peer (lest we create a * situation where we need to go back and undo a default peering * when the port is later peered by location data) */ if (port_dev->location) { /* we link the peer in match_location() if found */ usb_for_each_dev(port_dev, match_location); return; } else if (!hdev->parent) { struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_hcd *peer_hcd = hcd->shared_hcd; if (!peer_hcd) return; peer_hdev = peer_hcd->self.root_hub; } else { struct usb_port *upstream; struct usb_device *parent = hdev->parent; struct usb_hub *parent_hub = usb_hub_to_struct_hub(parent); if (!parent_hub) return; upstream = parent_hub->ports[hdev->portnum - 1]; if (!upstream || !upstream->peer) return; peer_hdev = upstream->peer->child; } peer_hub = usb_hub_to_struct_hub(peer_hdev); if (!peer_hub || port1 > peer_hdev->maxchild) return; /* * we found a valid default peer, last check is to make sure it * does not have location data */ peer = peer_hub->ports[port1 - 1]; if (peer && peer->location == 0) link_peers_report(port_dev, peer); } static int connector_bind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); int ret; ret = sysfs_create_link(&dev->kobj, &connector->kobj, "connector"); if (ret) return ret; ret = sysfs_create_link(&connector->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "connector"); return ret; } port_dev->connector = data; /* * If there is already USB device connected to the port, letting the * Type-C connector know about it immediately. */ if (port_dev->child) typec_attach(port_dev->connector, &port_dev->child->dev); return 0; } static void connector_unbind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); sysfs_remove_link(&connector->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "connector"); port_dev->connector = NULL; } static const struct component_ops connector_ops = { .bind = connector_bind, .unbind = connector_unbind, }; int usb_hub_create_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev; struct usb_device *hdev = hub->hdev; int retval; port_dev = kzalloc(sizeof(*port_dev), GFP_KERNEL); if (!port_dev) return -ENOMEM; port_dev->req = kzalloc(sizeof(*(port_dev->req)), GFP_KERNEL); if (!port_dev->req) { kfree(port_dev); return -ENOMEM; } port_dev->connect_type = usb_of_get_connect_type(hdev, port1); hub->ports[port1 - 1] = port_dev; port_dev->portnum = port1; set_bit(port1, hub->power_bits); port_dev->dev.parent = hub->intfdev; if (hub_is_superspeed(hdev)) { port_dev->is_superspeed = 1; port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; port_dev->dev.groups = port_dev_usb3_group; } else port_dev->dev.groups = port_dev_group; port_dev->dev.type = &usb_port_device_type; port_dev->dev.driver = &usb_port_driver; dev_set_name(&port_dev->dev, "%s-port%d", dev_name(&hub->hdev->dev), port1); mutex_init(&port_dev->status_lock); retval = device_register(&port_dev->dev); if (retval) { put_device(&port_dev->dev); return retval; } port_dev->state_kn = sysfs_get_dirent(port_dev->dev.kobj.sd, "state"); if (!port_dev->state_kn) { dev_err(&port_dev->dev, "failed to sysfs_get_dirent 'state'\n"); retval = -ENODEV; goto err_unregister; } /* Set default policy of port-poweroff disabled. */ retval = dev_pm_qos_add_request(&port_dev->dev, port_dev->req, DEV_PM_QOS_FLAGS, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { goto err_put_kn; } retval = component_add(&port_dev->dev, &connector_ops); if (retval) { dev_warn(&port_dev->dev, "failed to add component\n"); goto err_put_kn; } find_and_link_peer(hub, port1); /* * Enable runtime pm and hold a refernce that hub_configure() * will drop once the PM_QOS_NO_POWER_OFF flag state has been set * and the hub has been fully registered (hdev->maxchild set). */ pm_runtime_set_active(&port_dev->dev); pm_runtime_get_noresume(&port_dev->dev); pm_runtime_enable(&port_dev->dev); device_enable_async_suspend(&port_dev->dev); /* * Keep hidden the ability to enable port-poweroff if the hub * does not support power switching. */ if (!hub_is_port_power_switchable(hub)) return 0; /* Attempt to let userspace take over the policy. */ retval = dev_pm_qos_expose_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { dev_warn(&port_dev->dev, "failed to expose pm_qos_no_poweroff\n"); return 0; } /* Userspace owns the policy, drop the kernel 'no_poweroff' request. */ retval = dev_pm_qos_remove_request(port_dev->req); if (retval >= 0) { kfree(port_dev->req); port_dev->req = NULL; } return 0; err_put_kn: sysfs_put(port_dev->state_kn); err_unregister: device_unregister(&port_dev->dev); return retval; } void usb_hub_remove_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_port *peer; peer = port_dev->peer; if (peer) unlink_peers(port_dev, peer); component_del(&port_dev->dev, &connector_ops); sysfs_put(port_dev->state_kn); device_unregister(&port_dev->dev); } |
| 25 24 22 22 21 5 24 25 42 41 12 8 24 16 24 11 31 31 27 12 11 11 24 19 1 1 1 11 11 11 5 5 5 4 5 5 5 1 1 13 5 5 25 25 25 25 23 20 14 25 6 15 25 25 25 14 11 5 5 5 5 5 5 1 1 1 1 10 10 10 10 10 1 9 10 10 7 1 1 1 1 1 1 1 11 11 11 11 10 11 11 11 11 11 11 11 11 11 11 11 11 12 12 18 18 18 18 17 18 3 2 1 3 2 2 2 2 26 26 21 21 21 21 22 25 25 11 12 25 24 25 25 22 25 24 25 25 24 2 2 24 24 24 1 1 15 22 1 15 21 1 17 15 23 16 16 16 18 17 18 18 17 18 7 3 3 3 3 2 13 1 12 1 1 12 12 12 12 12 12 12 13 13 13 13 13 13 13 13 12 13 13 13 13 13 13 12 9 9 10 1 10 10 10 9 10 13 13 1 37 35 36 36 36 35 37 37 36 21 21 21 21 21 21 21 21 21 21 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 4 4 4 2 2 2 2 2 2 2 1 1 1 1 10 32 32 32 11 11 11 10 10 7 10 7 7 6 6 1 1 1 1 1 1 2 2 28 13 14 14 1 1 1 1 1 1 13 11 11 11 11 11 11 11 11 3 11 11 11 6 6 6 6 6 6 6 6 13 13 13 12 1 1 13 13 13 13 13 4 13 13 11 11 11 11 1 1 12 2 2 2 1 2 2 2 2 2 2 2 2 2 1 1 2 2 5 3 5 5 4 5 2 2 1 5 18 18 4 14 18 15 8 15 2 2 14 13 13 11 11 1 1 11 11 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. * * The iopt_pages is the center of the storage and motion of PFNs. Each * iopt_pages represents a logical linear array of full PFNs. The array is 0 * based and has npages in it. Accessors use 'index' to refer to the entry in * this logical array, regardless of its storage location. * * PFNs are stored in a tiered scheme: * 1) iopt_pages::pinned_pfns xarray * 2) An iommu_domain * 3) The origin of the PFNs, i.e. the userspace pointer * * PFN have to be copied between all combinations of tiers, depending on the * configuration. * * When a PFN is taken out of the userspace pointer it is pinned exactly once. * The storage locations of the PFN's index are tracked in the two interval * trees. If no interval includes the index then it is not pinned. * * If access_itree includes the PFN's index then an in-kernel access has * requested the page. The PFN is stored in the xarray so other requestors can * continue to find it. * * If the domains_itree includes the PFN's index then an iommu_domain is storing * the PFN and it can be read back using iommu_iova_to_phys(). To avoid * duplicating storage the xarray is not used if only iommu_domains are using * the PFN's index. * * As a general principle this is designed so that destroy never fails. This * means removing an iommu_domain or releasing a in-kernel access will not fail * due to insufficient memory. In practice this means some cases have to hold * PFNs in the xarray even though they are also being stored in an iommu_domain. * * While the iopt_pages can use an iommu_domain as storage, it does not have an * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages * and reference their own slice of the PFN array, with sub page granularity. * * In this file the term 'last' indicates an inclusive and closed interval, eg * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to * no PFNs. * * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ #include <linux/dma-buf.h> #include <linux/dma-resv.h> #include <linux/file.h> #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/kthread.h> #include <linux/overflow.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/vfio_pci_core.h> #include "double_span.h" #include "io_pagetable.h" #ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT 65536 #else #define TEMP_MEMORY_LIMIT iommufd_test_memory_limit #endif #define BATCH_BACKUP_SIZE 32 /* * More memory makes pin_user_pages() and the batching more efficient, but as * this is only a performance optimization don't try too hard to get it. A 64k * allocation can hold about 26M of 4k pages and 13G of 2M pages in an * pfn_batch. Various destroy paths cannot fail and provide a small amount of * stack memory as a backup contingency. If backup_len is given this cannot * fail. */ static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) { void *res; if (WARN_ON(*size == 0)) return NULL; if (*size < backup_len) return backup; if (!backup && iommufd_should_fail()) return NULL; *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (res) return res; *size = PAGE_SIZE; if (backup_len) { res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (res) return res; *size = backup_len; return backup; } return kmalloc(*size, GFP_KERNEL); } void interval_tree_double_span_iter_update( struct interval_tree_double_span_iter *iter) { unsigned long last_hole = ULONG_MAX; unsigned int i; for (i = 0; i != ARRAY_SIZE(iter->spans); i++) { if (interval_tree_span_iter_done(&iter->spans[i])) { iter->is_used = -1; return; } if (iter->spans[i].is_hole) { last_hole = min(last_hole, iter->spans[i].last_hole); continue; } iter->is_used = i + 1; iter->start_used = iter->spans[i].start_used; iter->last_used = min(iter->spans[i].last_used, last_hole); return; } iter->is_used = 0; iter->start_hole = iter->spans[0].start_hole; iter->last_hole = min(iter->spans[0].last_hole, iter->spans[1].last_hole); } void interval_tree_double_span_iter_first( struct interval_tree_double_span_iter *iter, struct rb_root_cached *itree1, struct rb_root_cached *itree2, unsigned long first_index, unsigned long last_index) { unsigned int i; iter->itrees[0] = itree1; iter->itrees[1] = itree2; for (i = 0; i != ARRAY_SIZE(iter->spans); i++) interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i], first_index, last_index); interval_tree_double_span_iter_update(iter); } void interval_tree_double_span_iter_next( struct interval_tree_double_span_iter *iter) { unsigned int i; if (iter->is_used == -1 || iter->last_hole == iter->spans[0].last_index) { iter->is_used = -1; return; } for (i = 0; i != ARRAY_SIZE(iter->spans); i++) interval_tree_span_iter_advance( &iter->spans[i], iter->itrees[i], iter->last_hole + 1); interval_tree_double_span_iter_update(iter); } static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) { int rc; rc = check_add_overflow(pages->npinned, npages, &pages->npinned); if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(rc || pages->npinned > pages->npages); } static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) { int rc; rc = check_sub_overflow(pages->npinned, npages, &pages->npinned); if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(rc || pages->npinned > pages->npages); } static void iopt_pages_err_unpin(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **page_list) { unsigned long npages = last_index - start_index + 1; unpin_user_pages(page_list, npages); iopt_pages_sub_npinned(pages, npages); } /* * index is the number of PAGE_SIZE units from the start of the area's * iopt_pages. If the iova is sub page-size then the area has an iova that * covers a portion of the first and last pages in the range. */ static unsigned long iopt_area_index_to_iova(struct iopt_area *area, unsigned long index) { if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(index < iopt_area_index(area) || index > iopt_area_last_index(area)); index -= iopt_area_index(area); if (index == 0) return iopt_area_iova(area); return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE; } static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, unsigned long index) { if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(index < iopt_area_index(area) || index > iopt_area_last_index(area)); if (index == iopt_area_last_index(area)) return iopt_area_last_iova(area); return iopt_area_iova(area) - area->page_offset + (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1; } static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, size_t size) { size_t ret; ret = iommu_unmap(domain, iova, size); /* * It is a logic error in this code or a driver bug if the IOMMU unmaps * something other than exactly as requested. This implies that the * iommu driver may not fail unmap for reasons beyond bad agruments. * Particularly, the iommu driver may not do a memory allocation on the * unmap path. */ WARN_ON(ret != size); } static void iopt_area_unmap_domain_range(struct iopt_area *area, struct iommu_domain *domain, unsigned long start_index, unsigned long last_index) { unsigned long start_iova = iopt_area_index_to_iova(area, start_index); iommu_unmap_nofail(domain, start_iova, iopt_area_index_to_iova_last(area, last_index) - start_iova + 1); } static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, unsigned long index) { struct interval_tree_node *node; node = interval_tree_iter_first(&pages->domains_itree, index, index); if (!node) return NULL; return container_of(node, struct iopt_area, pages_node); } enum batch_kind { BATCH_CPU_MEMORY = 0, BATCH_MMIO, }; /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. This is used as a temporary holding memory for shuttling pfns from one * place to another. Generally everything is made more efficient if operations * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles, * better cache locality, etc */ struct pfn_batch { unsigned long *pfns; u32 *npfns; unsigned int array_size; unsigned int end; unsigned int total_pfns; enum batch_kind kind; }; enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { batch->total_pfns = 0; batch->end = 0; batch->pfns[0] = 0; batch->npfns[0] = 0; batch->kind = 0; } /* * Carry means we carry a portion of the final hugepage over to the front of the * batch */ static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) { if (!keep_pfns) return batch_clear(batch); if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(!batch->end || batch->npfns[batch->end - 1] < keep_pfns); batch->total_pfns = keep_pfns; batch->pfns[0] = batch->pfns[batch->end - 1] + (batch->npfns[batch->end - 1] - keep_pfns); batch->npfns[0] = keep_pfns; batch->end = 1; } static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) { if (!batch->total_pfns) return; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(batch->total_pfns != batch->npfns[0]); skip_pfns = min(batch->total_pfns, skip_pfns); batch->pfns[0] += skip_pfns; batch->npfns[0] -= skip_pfns; batch->total_pfns -= skip_pfns; } static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, size_t backup_len) { const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns); size_t size = max_pages * elmsz; batch->pfns = temp_kmalloc(&size, backup, backup_len); if (!batch->pfns) return -ENOMEM; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz)) return -EINVAL; batch->array_size = size / elmsz; batch->npfns = (u32 *)(batch->pfns + batch->array_size); batch_clear(batch); return 0; } static int batch_init(struct pfn_batch *batch, size_t max_pages) { return __batch_init(batch, max_pages, NULL, 0); } static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, void *backup, size_t backup_len) { __batch_init(batch, max_pages, backup, backup_len); } static void batch_destroy(struct pfn_batch *batch, void *backup) { if (batch->pfns != backup) kfree(batch->pfns); } static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, u32 nr, enum batch_kind kind) { unsigned int end = batch->end; if (batch->kind != kind) { /* One kind per batch */ if (batch->end != 0) return false; batch->kind = kind; } if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && nr <= MAX_NPFNS - batch->npfns[end - 1]) { batch->npfns[end - 1] += nr; } else if (end < batch->array_size) { batch->pfns[end] = pfn; batch->npfns[end] = nr; batch->end++; } else { return false; } batch->total_pfns += nr; return true; } static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) { batch->npfns[batch->end - 1] -= nr; if (batch->npfns[batch->end - 1] == 0) batch->end--; batch->total_pfns -= nr; } /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); } /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. The caller should use * batch->total_pfns to determine the starting point for the next iteration. */ static void batch_from_domain(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index) { unsigned int page_offset = 0; unsigned long iova; phys_addr_t phys; iova = iopt_area_index_to_iova(area, start_index); if (start_index == iopt_area_index(area)) page_offset = area->page_offset; while (start_index <= last_index) { /* * This is pretty slow, it would be nice to get the page size * back from the driver, or have the driver directly fill the * batch. */ phys = iommu_iova_to_phys(domain, iova) - page_offset; if (!batch_add_pfn(batch, PHYS_PFN(phys))) return; iova += PAGE_SIZE - page_offset; page_offset = 0; start_index++; } } static struct page **raw_pages_from_domain(struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index, struct page **out_pages) { unsigned int page_offset = 0; unsigned long iova; phys_addr_t phys; iova = iopt_area_index_to_iova(area, start_index); if (start_index == iopt_area_index(area)) page_offset = area->page_offset; while (start_index <= last_index) { phys = iommu_iova_to_phys(domain, iova) - page_offset; *(out_pages++) = pfn_to_page(PHYS_PFN(phys)); iova += PAGE_SIZE - page_offset; page_offset = 0; start_index++; } return out_pages; } /* Continues reading a domain until we reach a discontinuity in the pfns. */ static void batch_from_domain_continue(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index, unsigned long last_index) { unsigned int array_size = batch->array_size; batch->array_size = batch->end; batch_from_domain(batch, domain, area, start_index, last_index); batch->array_size = array_size; } /* * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That * mode permits splitting a mapped area up, and then one of the splits is * unmapped. Doing this normally would cause us to violate our invariant of * pairing map/unmap. Thus, to support old VFIO compatibility disable support * for batching consecutive PFNs. All PFNs mapped into the iommu are done in * PAGE_SIZE units, not larger or smaller. */ static int batch_iommu_map_small(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { unsigned long start_iova = iova; int rc; if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE || size % PAGE_SIZE); while (size) { rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; iova += PAGE_SIZE; paddr += PAGE_SIZE; size -= PAGE_SIZE; } return 0; err_unmap: if (start_iova != iova) iommu_unmap_nofail(domain, start_iova, iova - start_iova); return rc; } static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, struct iopt_area *area, unsigned long start_index) { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; unsigned int cur = 0; unsigned long iova; int rc; if (batch->kind == BATCH_MMIO) { iommu_prot &= ~IOMMU_CACHE; iommu_prot |= IOMMU_MMIO; } /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; next_iova = iova = start_iova = iopt_area_index_to_iova(area, start_index); while (cur < batch->end) { next_iova = min(last_iova + 1, next_iova + batch->npfns[cur] * PAGE_SIZE - page_offset); if (disable_large_pages) rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, next_iova - iova, iommu_prot); else rc = iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; iova = next_iova; page_offset = 0; cur++; } return 0; err_unmap: if (start_iova != iova) iommu_unmap_nofail(domain, start_iova, iova - start_iova); return rc; } static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, unsigned long start_index, unsigned long last_index) { XA_STATE(xas, xa, start_index); void *entry; rcu_read_lock(); while (true) { entry = xas_next(&xas); if (xas_retry(&xas, entry)) continue; WARN_ON(!xa_is_value(entry)); if (!batch_add_pfn(batch, xa_to_value(entry)) || start_index == last_index) break; start_index++; } rcu_read_unlock(); } static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, unsigned long start_index, unsigned long last_index) { XA_STATE(xas, xa, start_index); void *entry; xas_lock(&xas); while (true) { entry = xas_next(&xas); if (xas_retry(&xas, entry)) continue; WARN_ON(!xa_is_value(entry)); if (!batch_add_pfn(batch, xa_to_value(entry))) break; xas_store(&xas, NULL); if (start_index == last_index) break; start_index++; } xas_unlock(&xas); } static void clear_xarray(struct xarray *xa, unsigned long start_index, unsigned long last_index) { XA_STATE(xas, xa, start_index); void *entry; xas_lock(&xas); xas_for_each(&xas, entry, last_index) xas_store(&xas, NULL); xas_unlock(&xas); } static int pages_to_xarray(struct xarray *xa, unsigned long start_index, unsigned long last_index, struct page **pages) { struct page **end_pages = pages + (last_index - start_index) + 1; struct page **half_pages = pages + (end_pages - pages) / 2; XA_STATE(xas, xa, start_index); do { void *old; xas_lock(&xas); while (pages != end_pages) { /* xarray does not participate in fault injection */ if (pages == half_pages && iommufd_should_fail()) { xas_set_err(&xas, -EINVAL); xas_unlock(&xas); /* aka xas_destroy() */ xas_nomem(&xas, GFP_KERNEL); goto err_clear; } old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); if (xas_error(&xas)) break; WARN_ON(old); pages++; xas_next(&xas); } xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); err_clear: if (xas_error(&xas)) { if (xas.xa_index != start_index) clear_xarray(xa, start_index, xas.xa_index - 1); return xas_error(&xas); } return 0; } static void batch_from_pages(struct pfn_batch *batch, struct page **pages, size_t npages) { struct page **end = pages + npages; for (; pages != end; pages++) if (!batch_add_pfn(batch, page_to_pfn(*pages))) break; } static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, unsigned long *offset_p, unsigned long npages) { int rc = 0; struct folio **folios = *folios_p; unsigned long offset = *offset_p; while (npages) { struct folio *folio = *folios; unsigned long nr = folio_nr_pages(folio) - offset; unsigned long pfn = page_to_pfn(folio_page(folio, offset)); nr = min(nr, npages); npages -= nr; if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; if (nr > 1) { rc = folio_add_pins(folio, nr - 1); if (rc) { batch_remove_pfn_num(batch, nr); goto out; } } folios++; offset = 0; } out: *folios_p = folios; *offset_p = offset; return rc; } static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { unsigned int cur = 0; while (first_page_off) { if (batch->npfns[cur] > first_page_off) break; first_page_off -= batch->npfns[cur]; cur++; } while (npages) { size_t to_unpin = min_t(size_t, npages, batch->npfns[cur] - first_page_off); unpin_user_page_range_dirty_lock( pfn_to_page(batch->pfns[cur] + first_page_off), to_unpin, pages->writable); iopt_pages_sub_npinned(pages, to_unpin); cur++; first_page_off = 0; npages -= to_unpin; } } static void copy_data_page(struct page *page, void *data, unsigned long offset, size_t length, unsigned int flags) { void *mem; mem = kmap_local_page(page); if (flags & IOMMUFD_ACCESS_RW_WRITE) { memcpy(mem + offset, data, length); set_page_dirty_lock(page); } else { memcpy(data, mem + offset, length); } kunmap_local(mem); } static unsigned long batch_rw(struct pfn_batch *batch, void *data, unsigned long offset, unsigned long length, unsigned int flags) { unsigned long copied = 0; unsigned int npage = 0; unsigned int cur = 0; while (cur < batch->end) { unsigned long bytes = min(length, PAGE_SIZE - offset); copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data, offset, bytes, flags); offset = 0; length -= bytes; data += bytes; copied += bytes; npage++; if (npage == batch->npfns[cur]) { npage = 0; cur++; } if (!length) break; } return copied; } /* pfn_reader_user is just the pin_user_pages() path */ struct pfn_reader_user { struct page **upages; size_t upages_len; unsigned long upages_start; unsigned long upages_end; unsigned int gup_flags; /* * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is * neither */ int locked; /* The following are only valid if file != NULL. */ struct file *file; struct folio **ufolios; size_t ufolios_len; unsigned long ufolios_offset; struct folio **ufolios_next; }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { user->upages = NULL; user->upages_len = 0; user->upages_start = 0; user->upages_end = 0; user->locked = -1; user->gup_flags = FOLL_LONGTERM; if (pages->writable) user->gup_flags |= FOLL_WRITE; user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL; user->ufolios = NULL; user->ufolios_len = 0; user->ufolios_next = NULL; user->ufolios_offset = 0; } static void pfn_reader_user_destroy(struct pfn_reader_user *user, struct iopt_pages *pages) { if (user->locked != -1) { if (user->locked) mmap_read_unlock(pages->source_mm); if (!user->file && pages->source_mm != current->mm) mmput(pages->source_mm); user->locked = -1; } kfree(user->upages); user->upages = NULL; kfree(user->ufolios); user->ufolios = NULL; } static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, unsigned long npages) { unsigned long i; unsigned long offset; unsigned long npages_out = 0; struct page **upages = user->upages; unsigned long end = start + (npages << PAGE_SHIFT) - 1; long nfolios = user->ufolios_len / sizeof(*user->ufolios); /* * todo: memfd_pin_folios should return the last pinned offset so * we can compute npages pinned, and avoid looping over folios here * if upages == NULL. */ nfolios = memfd_pin_folios(user->file, start, end, user->ufolios, nfolios, &offset); if (nfolios <= 0) return nfolios; offset >>= PAGE_SHIFT; user->ufolios_next = user->ufolios; user->ufolios_offset = offset; for (i = 0; i < nfolios; i++) { struct folio *folio = user->ufolios[i]; unsigned long nr = folio_nr_pages(folio); unsigned long npin = min(nr - offset, npages); npages -= npin; npages_out += npin; if (upages) { if (npin == 1) { *upages++ = folio_page(folio, offset); } else { int rc = folio_add_pins(folio, npin - 1); if (rc) return rc; while (npin--) *upages++ = folio_page(folio, offset++); } } offset = 0; } return npages_out; } static int pfn_reader_user_pin(struct pfn_reader_user *user, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { bool remote_mm = pages->source_mm != current->mm; unsigned long npages = last_index - start_index + 1; unsigned long start; unsigned long unum; uintptr_t uptr; long rc; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(last_index < start_index)) return -EINVAL; if (!user->file && !user->upages) { /* All undone in pfn_reader_destroy() */ user->upages_len = npages * sizeof(*user->upages); user->upages = temp_kmalloc(&user->upages_len, NULL, 0); if (!user->upages) return -ENOMEM; } if (user->file && !user->ufolios) { user->ufolios_len = npages * sizeof(*user->ufolios); user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0); if (!user->ufolios) return -ENOMEM; } if (user->locked == -1) { /* * The majority of usages will run the map task within the mm * providing the pages, so we can optimize into * get_user_pages_fast() */ if (!user->file && remote_mm) { if (!mmget_not_zero(pages->source_mm)) return -EFAULT; } user->locked = 0; } unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) : user->upages_len / sizeof(*user->upages); npages = min_t(unsigned long, npages, unum); if (iommufd_should_fail()) return -EFAULT; if (user->file) { start = pages->start + (start_index * PAGE_SIZE); rc = pin_memfd_pages(user, start, npages); } else if (!remote_mm) { uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, user->upages); } else { uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!user->locked) { mmap_read_lock(pages->source_mm); user->locked = 1; } rc = pin_user_pages_remote(pages->source_mm, uptr, npages, user->gup_flags, user->upages, &user->locked); } if (rc <= 0) { if (WARN_ON(!rc)) return -EFAULT; return rc; } iopt_pages_add_npinned(pages, rc); user->upages_start = start_index; user->upages_end = start_index + rc; return 0; } /* This is the "modern" and faster accounting method used by io_uring */ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) { unsigned long lock_limit; unsigned long cur_pages; unsigned long new_pages; lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&pages->source_user->locked_vm); do { new_pages = cur_pages + npages; if (new_pages > lock_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&pages->source_user->locked_vm, &cur_pages, new_pages)); return 0; } static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) { if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)) return; atomic_long_sub(npages, &pages->source_user->locked_vm); } /* This is the accounting method used for compatibility with VFIO */ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, bool inc, struct pfn_reader_user *user) { bool do_put = false; int rc; if (user && user->locked) { mmap_read_unlock(pages->source_mm); user->locked = 0; /* If we had the lock then we also have a get */ } else if ((!user || (!user->upages && !user->ufolios)) && pages->source_mm != current->mm) { if (!mmget_not_zero(pages->source_mm)) return -EINVAL; do_put = true; } mmap_write_lock(pages->source_mm); rc = __account_locked_vm(pages->source_mm, npages, inc, pages->source_task, false); mmap_write_unlock(pages->source_mm); if (do_put) mmput(pages->source_mm); return rc; } int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, bool inc, struct pfn_reader_user *user) { int rc = 0; switch (pages->account_mode) { case IOPT_PAGES_ACCOUNT_NONE: break; case IOPT_PAGES_ACCOUNT_USER: if (inc) rc = incr_user_locked_vm(pages, npages); else decr_user_locked_vm(pages, npages); break; case IOPT_PAGES_ACCOUNT_MM: rc = update_mm_locked_vm(pages, npages, inc, user); break; } if (rc) return rc; pages->last_npinned = pages->npinned; if (inc) atomic64_add(npages, &pages->source_mm->pinned_vm); else atomic64_sub(npages, &pages->source_mm->pinned_vm); return 0; } static void update_unpinned(struct iopt_pages *pages) { if (WARN_ON(pages->npinned > pages->last_npinned)) return; if (pages->npinned == pages->last_npinned) return; iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned, false, NULL); } /* * Changes in the number of pages pinned is done after the pages have been read * and processed. If the user lacked the limit then the error unwind will unpin * everything that was just pinned. This is because it is expensive to calculate * how many pages we have already pinned within a range to generate an accurate * prediction in advance of doing the work to actually pin them. */ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, struct iopt_pages *pages) { unsigned long npages; bool inc; lockdep_assert_held(&pages->mutex); if (pages->npinned == pages->last_npinned) return 0; if (pages->npinned < pages->last_npinned) { npages = pages->last_npinned - pages->npinned; inc = false; } else { if (iommufd_should_fail()) return -ENOMEM; npages = pages->npinned - pages->last_npinned; inc = true; } return iopt_pages_update_pinned(pages, npages, inc, user); } struct pfn_reader_dmabuf { struct dma_buf_phys_vec phys; unsigned long start_offset; }; static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, struct iopt_pages *pages) { /* Callers must not get here if the dmabuf was already revoked */ if (WARN_ON(iopt_dmabuf_revoked(pages))) return -EINVAL; dmabuf->phys = pages->dmabuf.phys; dmabuf->start_offset = pages->dmabuf.start; return 0; } static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, struct pfn_batch *batch, unsigned long start_index, unsigned long last_index) { unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; /* * start/last_index and start are all PAGE_SIZE aligned, the batch is * always filled using page size aligned PFNs just like the other types. * If the dmabuf has been sliced on a sub page offset then the common * batch to domain code will adjust it before mapping to the domain. */ batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), last_index - start_index + 1, BATCH_MMIO); return 0; } /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. This is only populated if there is a * iopt_pages_access * - The iommu_domain under an area * - The original PFN source, ie pages->source_mm * * This iterator reads the pfns optimizing to load according to the * above order. */ struct pfn_reader { struct iopt_pages *pages; struct interval_tree_double_span_iter span; struct pfn_batch batch; unsigned long batch_start_index; unsigned long batch_end_index; unsigned long last_index; union { struct pfn_reader_user user; struct pfn_reader_dmabuf dmabuf; }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) { return pfn_reader_user_update_pinned(&pfns->user, pfns->pages); } /* * The batch can contain a mixture of pages that are still in use and pages that * need to be unpinned. Unpin only pages that are not held anywhere else. */ static void pfn_reader_unpin(struct pfn_reader *pfns) { unsigned long last = pfns->batch_end_index - 1; unsigned long start = pfns->batch_start_index; struct interval_tree_double_span_iter span; struct iopt_pages *pages = pfns->pages; lockdep_assert_held(&pages->mutex); interval_tree_for_each_double_span(&span, &pages->access_itree, &pages->domains_itree, start, last) { if (span.is_used) continue; batch_unpin(&pfns->batch, pages, span.start_hole - start, span.last_hole - span.start_hole + 1); } } /* Process a single span to load it from the proper storage */ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; struct pfn_reader_user *user; unsigned long npages; struct iopt_area *area; int rc; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(span->last_used < start_index)) return -EINVAL; if (span->is_used == 1) { batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, start_index, span->last_used); return 0; } if (span->is_used == 2) { /* * Pull as many pages from the first domain we find in the * target span. If it is too small then we will be called again * and we'll find another area. */ area = iopt_pages_find_domain_area(pfns->pages, start_index); if (WARN_ON(!area)) return -EINVAL; /* The storage_domain cannot change without the pages mutex */ batch_from_domain( &pfns->batch, area->storage_domain, area, start_index, min(iopt_area_last_index(area), span->last_used)); return 0; } if (iopt_is_dmabuf(pfns->pages)) return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, start_index, span->last_hole); user = &pfns->user; if (start_index >= user->upages_end) { rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; } npages = user->upages_end - start_index; start_index -= user->upages_start; rc = 0; if (!user->file) batch_from_pages(&pfns->batch, user->upages + start_index, npages); else rc = batch_from_folios(&pfns->batch, &user->ufolios_next, &user->ufolios_offset, npages); return rc; } static bool pfn_reader_done(struct pfn_reader *pfns) { return pfns->batch_start_index == pfns->last_index + 1; } static int pfn_reader_next(struct pfn_reader *pfns) { int rc; batch_clear(&pfns->batch); pfns->batch_start_index = pfns->batch_end_index; while (pfns->batch_end_index != pfns->last_index + 1) { unsigned int npfns = pfns->batch.total_pfns; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(interval_tree_double_span_iter_done(&pfns->span))) return -EINVAL; rc = pfn_reader_fill_span(pfns); if (rc) return rc; if (WARN_ON(!pfns->batch.total_pfns)) return -EINVAL; pfns->batch_end_index = pfns->batch_start_index + pfns->batch.total_pfns; if (pfns->batch_end_index == pfns->span.last_used + 1) interval_tree_double_span_iter_next(&pfns->span); /* Batch is full */ if (npfns == pfns->batch.total_pfns) return 0; } return 0; } static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { int rc; lockdep_assert_held(&pages->mutex); pfns->pages = pages; pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; if (iopt_is_dmabuf(pages)) pfn_reader_dmabuf_init(&pfns->dmabuf, pages); else pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree, &pages->domains_itree, start_index, last_index); return 0; } /* * There are many assertions regarding the state of pages->npinned vs * pages->last_pinned, for instance something like unmapping a domain must only * decrement the npinned, and pfn_reader_destroy() must be called only after all * the pins are updated. This is fine for success flows, but error flows * sometimes need to release the pins held inside the pfn_reader before going on * to complete unmapping and releasing pins held in domains. */ static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; struct pfn_reader_user *user; if (iopt_is_dmabuf(pages)) return; user = &pfns->user; if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ unsigned long npages = user->upages_end - pfns->batch_end_index; unsigned long start_index = pfns->batch_end_index - user->upages_start; if (!user->file) { unpin_user_pages(user->upages + start_index, npages); } else { long n = user->ufolios_len / sizeof(*user->ufolios); unpin_folios(user->ufolios_next, user->ufolios + n - user->ufolios_next); } iopt_pages_sub_npinned(pages, npages); user->upages_end = pfns->batch_end_index; } if (pfns->batch_start_index != pfns->batch_end_index) { pfn_reader_unpin(pfns); pfns->batch_start_index = pfns->batch_end_index; } } static void pfn_reader_destroy(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); if (!iopt_is_dmabuf(pfns->pages)) pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { int rc; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(last_index < start_index)) return -EINVAL; rc = pfn_reader_init(pfns, pages, start_index, last_index); if (rc) return rc; rc = pfn_reader_next(pfns); if (rc) { pfn_reader_destroy(pfns); return rc; } return 0; } static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, unsigned long length, bool writable) { struct iopt_pages *pages; /* * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP * below from overflow */ if (length > SIZE_MAX - PAGE_SIZE || length == 0) return ERR_PTR(-EINVAL); pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); kref_init(&pages->kref); xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT); mutex_init(&pages->mutex); pages->source_mm = current->mm; mmgrab(pages->source_mm); pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE); pages->access_itree = RB_ROOT_CACHED; pages->domains_itree = RB_ROOT_CACHED; pages->writable = writable; if (capable(CAP_IPC_LOCK)) pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; else pages->account_mode = IOPT_PAGES_ACCOUNT_USER; pages->source_task = current->group_leader; get_task_struct(current->group_leader); pages->source_user = get_uid(current_user()); return pages; } struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable) { struct iopt_pages *pages; unsigned long end; void __user *uptr_down = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); if (check_add_overflow((unsigned long)uptr, length, &end)) return ERR_PTR(-EOVERFLOW); pages = iopt_alloc_pages(uptr - uptr_down, length, writable); if (IS_ERR(pages)) return pages; pages->uptr = uptr_down; pages->type = IOPT_ADDRESS_USER; return pages; } struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start_byte, unsigned long start, unsigned long length, bool writable) { struct iopt_pages *pages; pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; pages->file = get_file(file); pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; return pages; } static void iopt_revoke_notify(struct dma_buf_attachment *attach) { struct iopt_pages *pages = attach->importer_priv; struct iopt_pages_dmabuf_track *track; guard(mutex)(&pages->mutex); if (iopt_dmabuf_revoked(pages)) return; list_for_each_entry(track, &pages->dmabuf.tracker, elm) { struct iopt_area *area = track->area; iopt_area_unmap_domain_range(area, track->domain, iopt_area_index(area), iopt_area_last_index(area)); } pages->dmabuf.phys.len = 0; } static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { .allow_peer2peer = true, .move_notify = iopt_revoke_notify, }; /* * iommufd and vfio have a circular dependency. Future work for a phys * based private interconnect will remove this. */ static int sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, struct dma_buf_phys_vec *phys) { typeof(&vfio_pci_dma_buf_iommufd_map) fn; int rc; rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); if (rc != -EOPNOTSUPP) return rc; if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) return -EOPNOTSUPP; fn = symbol_get(vfio_pci_dma_buf_iommufd_map); if (!fn) return -EOPNOTSUPP; rc = fn(attachment, phys); symbol_put(vfio_pci_dma_buf_iommufd_map); return rc; } static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; int rc; attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), &iopt_dmabuf_attach_revoke_ops, pages); if (IS_ERR(attach)) return PTR_ERR(attach); dma_resv_lock(dmabuf->resv, NULL); /* * Lock ordering requires the mutex to be taken inside the reservation, * make sure lockdep sees this. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { mutex_lock(&pages->mutex); mutex_unlock(&pages->mutex); } rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); if (rc) goto err_detach; dma_resv_unlock(dmabuf->resv); /* On success iopt_release_pages() will detach and put the dmabuf. */ pages->dmabuf.attach = attach; return 0; err_detach: dma_resv_unlock(dmabuf->resv); dma_buf_detach(dmabuf, attach); return rc; } struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, struct dma_buf *dmabuf, unsigned long start_byte, unsigned long start, unsigned long length, bool writable) { static struct lock_class_key pages_dmabuf_mutex_key; struct iopt_pages *pages; int rc; if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) return ERR_PTR(-EOPNOTSUPP); if (dmabuf->size <= (start + length - 1) || length / PAGE_SIZE >= MAX_NPFNS) return ERR_PTR(-EINVAL); pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; /* * The mmap_lock can be held when obtaining the dmabuf reservation lock * which creates a locking cycle with the pages mutex which is held * while obtaining the mmap_lock. This locking path is not present for * IOPT_ADDRESS_DMABUF so split the lock class. */ lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); /* dmabuf does not use pinned page accounting. */ pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; pages->type = IOPT_ADDRESS_DMABUF; pages->dmabuf.start = start - start_byte; INIT_LIST_HEAD(&pages->dmabuf.tracker); rc = iopt_map_dmabuf(ictx, pages, dmabuf); if (rc) { iopt_put_pages(pages); return ERR_PTR(rc); } return pages; } int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, struct iommu_domain *domain) { struct iopt_pages_dmabuf_track *track; lockdep_assert_held(&pages->mutex); if (WARN_ON(!iopt_is_dmabuf(pages))) return -EINVAL; list_for_each_entry(track, &pages->dmabuf.tracker, elm) if (WARN_ON(track->domain == domain && track->area == area)) return -EINVAL; track = kzalloc(sizeof(*track), GFP_KERNEL); if (!track) return -ENOMEM; track->domain = domain; track->area = area; list_add_tail(&track->elm, &pages->dmabuf.tracker); return 0; } void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, struct iopt_area *area, struct iommu_domain *domain) { struct iopt_pages_dmabuf_track *track; lockdep_assert_held(&pages->mutex); WARN_ON(!iopt_is_dmabuf(pages)); list_for_each_entry(track, &pages->dmabuf.tracker, elm) { if (track->domain == domain && track->area == area) { list_del(&track->elm); kfree(track); return; } } WARN_ON(true); } int iopt_dmabuf_track_all_domains(struct iopt_area *area, struct iopt_pages *pages) { struct iopt_pages_dmabuf_track *track; struct iommu_domain *domain; unsigned long index; int rc; list_for_each_entry(track, &pages->dmabuf.tracker, elm) if (WARN_ON(track->area == area)) return -EINVAL; xa_for_each(&area->iopt->domains, index, domain) { rc = iopt_dmabuf_track_domain(pages, area, domain); if (rc) goto err_untrack; } return 0; err_untrack: iopt_dmabuf_untrack_all_domains(area, pages); return rc; } void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, struct iopt_pages *pages) { struct iopt_pages_dmabuf_track *track; struct iopt_pages_dmabuf_track *tmp; list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, elm) { if (track->area == area) { list_del(&track->elm); kfree(track); } } } void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root)); WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root)); WARN_ON(pages->npinned); WARN_ON(!xa_empty(&pages->pinned_pfns)); mmdrop(pages->source_mm); mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; dma_buf_detach(dmabuf, pages->dmabuf.attach); dma_buf_put(dmabuf); WARN_ON(!list_empty(&pages->dmabuf.tracker)); } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); } kfree(pages); } static void iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long start_index, unsigned long last_index, unsigned long *unmapped_end_index, unsigned long real_last_index) { while (start_index <= last_index) { unsigned long batch_last_index; if (*unmapped_end_index <= last_index) { unsigned long start = max(start_index, *unmapped_end_index); if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && batch->total_pfns) WARN_ON(*unmapped_end_index - batch->total_pfns != start_index); batch_from_domain(batch, domain, area, start, last_index); batch_last_index = start_index + batch->total_pfns - 1; } else { batch_last_index = last_index; } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(batch_last_index > real_last_index); /* * unmaps must always 'cut' at a place where the pfns are not * contiguous to pair with the maps that always install * contiguous pages. Thus, if we have to stop unpinning in the * middle of the domains we need to keep reading pfns until we * find a cut point to do the unmap. The pfns we read are * carried over and either skipped or integrated into the next * batch. */ if (batch_last_index == last_index && last_index != real_last_index) batch_from_domain_continue(batch, domain, area, last_index + 1, real_last_index); if (*unmapped_end_index <= batch_last_index) { iopt_area_unmap_domain_range( area, domain, *unmapped_end_index, start_index + batch->total_pfns - 1); *unmapped_end_index = start_index + batch->total_pfns; } /* unpin must follow unmap */ batch_unpin(batch, pages, 0, batch_last_index - start_index + 1); start_index = batch_last_index + 1; batch_clear_carry(batch, *unmapped_end_index - batch_last_index - 1); } } static void __iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long last_index) { struct interval_tree_double_span_iter span; unsigned long start_index = iopt_area_index(area); unsigned long unmapped_end_index = start_index; u64 backup[BATCH_BACKUP_SIZE]; struct pfn_batch batch; lockdep_assert_held(&pages->mutex); if (iopt_is_dmabuf(pages)) { if (WARN_ON(iopt_dmabuf_revoked(pages))) return; iopt_area_unmap_domain_range(area, domain, start_index, last_index); return; } /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. * This creates a complexity where we need to skip over unpinning pages * held in the xarray, but continue to unmap from the domain. * * The domain unmap cannot stop in the middle of a contiguous range of * PFNs. To solve this problem the unpinning step will read ahead to the * end of any contiguous span, unmap that whole span, and then only * unpin the leading part that does not have any accesses. The residual * PFNs that were unmapped but not unpinned are called a "carry" in the * batch as they are moved to the front of the PFN list and continue on * to the next iteration(s). */ batch_init_backup(&batch, last_index + 1, backup, sizeof(backup)); interval_tree_for_each_double_span(&span, &pages->domains_itree, &pages->access_itree, start_index, last_index) { if (span.is_used) { batch_skip_carry(&batch, span.last_used - span.start_used + 1); continue; } iopt_area_unpin_domain(&batch, area, pages, domain, span.start_hole, span.last_hole, &unmapped_end_index, last_index); } /* * If the range ends in a access then we do the residual unmap without * any unpins. */ if (unmapped_end_index != last_index + 1) iopt_area_unmap_domain_range(area, domain, unmapped_end_index, last_index); WARN_ON(batch.total_pfns); batch_destroy(&batch, backup); update_unpinned(pages); } static void iopt_area_unfill_partial_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain, unsigned long end_index) { if (end_index != iopt_area_index(area)) __iopt_area_unfill_domain(area, pages, domain, end_index - 1); } /** * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain * @area: The IOVA range to unmap * @domain: The domain to unmap * * The caller must know that unpinning is not required, usually because there * are other domains in the iopt. */ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) { iommu_unmap_nofail(domain, iopt_area_iova(area), iopt_area_length(area)); } /** * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain * @area: IOVA area to use * @pages: page supplier for the area (area->pages is NULL) * @domain: Domain to unmap from * * The domain should be removed from the domains_itree before calling. The * domain will always be unmapped, but the PFNs may not be unpinned if there are * still accesses. */ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { if (iopt_dmabuf_revoked(pages)) return; __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } /** * iopt_area_fill_domain() - Map PFNs from the area into a domain * @area: IOVA area to use * @domain: Domain to load PFNs into * * Read the pfns from the area's underlying iopt_pages and map them into the * given domain. Called when attaching a new domain to an io_pagetable. */ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) { unsigned long done_end_index; struct pfn_reader pfns; int rc; lockdep_assert_held(&area->pages->mutex); if (iopt_dmabuf_revoked(area->pages)) return 0; rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) return rc; while (!pfn_reader_done(&pfns)) { done_end_index = pfns.batch_start_index; rc = batch_to_domain(&pfns.batch, domain, area, pfns.batch_start_index); if (rc) goto out_unmap; done_end_index = pfns.batch_end_index; rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; goto out_destroy; out_unmap: pfn_reader_release_pins(&pfns); iopt_area_unfill_partial_domain(area, area->pages, domain, done_end_index); out_destroy: pfn_reader_destroy(&pfns); return rc; } /** * iopt_area_fill_domains() - Install PFNs into the area's domains * @area: The area to act on * @pages: The pages associated with the area (area->pages is NULL) * * Called during area creation. The area is freshly created and not inserted in * the domains_itree yet. PFNs are read and loaded into every domain held in the * area's io_pagetable and the area is installed in the domains_itree. * * On failure all domains are left unchanged. */ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) { unsigned long done_first_end_index; unsigned long done_all_end_index; struct iommu_domain *domain; unsigned long unmap_index; struct pfn_reader pfns; unsigned long index; int rc; lockdep_assert_held(&area->iopt->domains_rwsem); if (xa_empty(&area->iopt->domains)) return 0; mutex_lock(&pages->mutex); if (iopt_is_dmabuf(pages)) { rc = iopt_dmabuf_track_all_domains(area, pages); if (rc) goto out_unlock; } if (!iopt_dmabuf_revoked(pages)) { rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) goto out_untrack; while (!pfn_reader_done(&pfns)) { done_first_end_index = pfns.batch_end_index; done_all_end_index = pfns.batch_start_index; xa_for_each(&area->iopt->domains, index, domain) { rc = batch_to_domain(&pfns.batch, domain, area, pfns.batch_start_index); if (rc) goto out_unmap; } done_all_end_index = done_first_end_index; rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; pfn_reader_destroy(&pfns); } area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); mutex_unlock(&pages->mutex); return 0; out_unmap: pfn_reader_release_pins(&pfns); xa_for_each(&area->iopt->domains, unmap_index, domain) { unsigned long end_index; if (unmap_index < index) end_index = done_first_end_index; else end_index = done_all_end_index; /* * The area is not yet part of the domains_itree so we have to * manage the unpinning specially. The last domain does the * unpin, every other domain is just unmapped. */ if (unmap_index != area->iopt->next_domain_id - 1) { if (end_index != iopt_area_index(area)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), end_index - 1); } else { iopt_area_unfill_partial_domain(area, pages, domain, end_index); } } pfn_reader_destroy(&pfns); out_untrack: if (iopt_is_dmabuf(pages)) iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; } /** * iopt_area_unfill_domains() - unmap PFNs from the area's domains * @area: The area to act on * @pages: The pages associated with the area (area->pages is NULL) * * Called during area destruction. This unmaps the iova's covered by all the * area's domains and releases the PFNs. */ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) { struct io_pagetable *iopt = area->iopt; struct iommu_domain *domain; unsigned long index; lockdep_assert_held(&iopt->domains_rwsem); mutex_lock(&pages->mutex); if (!area->storage_domain) goto out_unlock; xa_for_each(&iopt->domains, index, domain) { if (domain == area->storage_domain) continue; if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); if (iopt_is_dmabuf(pages)) iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); } static void iopt_pages_unpin_xarray(struct pfn_batch *batch, struct iopt_pages *pages, unsigned long start_index, unsigned long end_index) { while (start_index <= end_index) { batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index, end_index); batch_unpin(batch, pages, 0, batch->total_pfns); start_index += batch->total_pfns; batch_clear(batch); } } /** * iopt_pages_unfill_xarray() - Update the xarry after removing an access * @pages: The pages to act on * @start_index: Starting PFN index * @last_index: Last PFN index * * Called when an iopt_pages_access is removed, removes pages from the itree. * The access should already be removed from the access_itree. */ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index) { struct interval_tree_double_span_iter span; u64 backup[BATCH_BACKUP_SIZE]; struct pfn_batch batch; bool batch_inited = false; lockdep_assert_held(&pages->mutex); interval_tree_for_each_double_span(&span, &pages->access_itree, &pages->domains_itree, start_index, last_index) { if (!span.is_used) { if (!batch_inited) { batch_init_backup(&batch, last_index - start_index + 1, backup, sizeof(backup)); batch_inited = true; } iopt_pages_unpin_xarray(&batch, pages, span.start_hole, span.last_hole); } else if (span.is_used == 2) { /* Covered by a domain */ clear_xarray(&pages->pinned_pfns, span.start_used, span.last_used); } /* Otherwise covered by an existing access */ } if (batch_inited) batch_destroy(&batch, backup); update_unpinned(pages); } /** * iopt_pages_fill_from_xarray() - Fast path for reading PFNs * @pages: The pages to act on * @start_index: The first page index in the range * @last_index: The last page index in the range * @out_pages: The output array to return the pages * * This can be called if the caller is holding a refcount on an * iopt_pages_access that is known to have already been filled. It quickly reads * the pages directly from the xarray. * * This is part of the SW iommu interface to read pages for in-kernel use. */ void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { XA_STATE(xas, &pages->pinned_pfns, start_index); void *entry; rcu_read_lock(); while (start_index <= last_index) { entry = xas_next(&xas); if (xas_retry(&xas, entry)) continue; WARN_ON(!xa_is_value(entry)); *(out_pages++) = pfn_to_page(xa_to_value(entry)); start_index++; } rcu_read_unlock(); } static int iopt_pages_fill_from_domain(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { while (start_index != last_index + 1) { unsigned long domain_last; struct iopt_area *area; area = iopt_pages_find_domain_area(pages, start_index); if (WARN_ON(!area)) return -EINVAL; domain_last = min(iopt_area_last_index(area), last_index); out_pages = raw_pages_from_domain(area->storage_domain, area, start_index, domain_last, out_pages); start_index = domain_last + 1; } return 0; } static int iopt_pages_fill(struct iopt_pages *pages, struct pfn_reader_user *user, unsigned long start_index, unsigned long last_index, struct page **out_pages) { unsigned long cur_index = start_index; int rc; while (cur_index != last_index + 1) { user->upages = out_pages + (cur_index - start_index); rc = pfn_reader_user_pin(user, pages, cur_index, last_index); if (rc) goto out_unpin; cur_index = user->upages_end; } return 0; out_unpin: if (start_index != cur_index) iopt_pages_err_unpin(pages, start_index, cur_index - 1, out_pages); return rc; } /** * iopt_pages_fill_xarray() - Read PFNs * @pages: The pages to act on * @start_index: The first page index in the range * @last_index: The last page index in the range * @out_pages: The output array to return the pages, may be NULL * * This populates the xarray and returns the pages in out_pages. As the slow * path this is able to copy pages from other storage tiers into the xarray. * * On failure the xarray is left unchanged. * * This is part of the SW iommu interface to read pages for in-kernel use. */ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, struct page **out_pages) { struct interval_tree_double_span_iter span; unsigned long xa_end = start_index; struct pfn_reader_user user; int rc; lockdep_assert_held(&pages->mutex); pfn_reader_user_init(&user, pages); user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages); interval_tree_for_each_double_span(&span, &pages->access_itree, &pages->domains_itree, start_index, last_index) { struct page **cur_pages; if (span.is_used == 1) { cur_pages = out_pages + (span.start_used - start_index); iopt_pages_fill_from_xarray(pages, span.start_used, span.last_used, cur_pages); continue; } if (span.is_used == 2) { cur_pages = out_pages + (span.start_used - start_index); iopt_pages_fill_from_domain(pages, span.start_used, span.last_used, cur_pages); rc = pages_to_xarray(&pages->pinned_pfns, span.start_used, span.last_used, cur_pages); if (rc) goto out_clean_xa; xa_end = span.last_used + 1; continue; } /* hole */ cur_pages = out_pages + (span.start_hole - start_index); rc = iopt_pages_fill(pages, &user, span.start_hole, span.last_hole, cur_pages); if (rc) goto out_clean_xa; rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, span.last_hole, cur_pages); if (rc) { iopt_pages_err_unpin(pages, span.start_hole, span.last_hole, cur_pages); goto out_clean_xa; } xa_end = span.last_hole + 1; } rc = pfn_reader_user_update_pinned(&user, pages); if (rc) goto out_clean_xa; user.upages = NULL; pfn_reader_user_destroy(&user, pages); return 0; out_clean_xa: if (start_index != xa_end) iopt_pages_unfill_xarray(pages, start_index, xa_end - 1); user.upages = NULL; pfn_reader_user_destroy(&user, pages); return rc; } /* * This uses the pfn_reader instead of taking a shortcut by using the mm. It can * do every scenario and is fully consistent with what an iommu_domain would * see. */ static int iopt_pages_rw_slow(struct iopt_pages *pages, unsigned long start_index, unsigned long last_index, unsigned long offset, void *data, unsigned long length, unsigned int flags) { struct pfn_reader pfns; int rc; mutex_lock(&pages->mutex); rc = pfn_reader_first(&pfns, pages, start_index, last_index); if (rc) goto out_unlock; while (!pfn_reader_done(&pfns)) { unsigned long done; done = batch_rw(&pfns.batch, data, offset, length, flags); data += done; length -= done; offset = 0; pfn_reader_unpin(&pfns); rc = pfn_reader_next(&pfns); if (rc) goto out_destroy; } if (WARN_ON(length != 0)) rc = -EINVAL; out_destroy: pfn_reader_destroy(&pfns); out_unlock: mutex_unlock(&pages->mutex); return rc; } /* * A medium speed path that still allows DMA inconsistencies, but doesn't do any * memory allocations or interval tree searches. */ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, unsigned long offset, void *data, unsigned long length, unsigned int flags) { struct page *page = NULL; int rc; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(pages->type != IOPT_ADDRESS_USER)) return -EINVAL; if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); if (iommufd_should_fail()) { rc = -EINVAL; goto out_mmput; } mmap_read_lock(pages->source_mm); rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(pages->source_mm); if (rc != 1) { if (WARN_ON(rc >= 0)) rc = -EINVAL; goto out_mmput; } copy_data_page(page, data, offset, length, flags); unpin_user_page(page); rc = 0; out_mmput: mmput(pages->source_mm); return rc; } /** * iopt_pages_rw_access - Copy to/from a linear slice of the pages * @pages: pages to act on * @start_byte: First byte of pages to copy to/from * @data: Kernel buffer to get/put the data * @length: Number of bytes to copy * @flags: IOMMUFD_ACCESS_RW_* flags * * This will find each page in the range, kmap it and then memcpy to/from * the given kernel buffer. */ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags) { unsigned long start_index = start_byte / PAGE_SIZE; unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE; bool change_mm = current->mm != pages->source_mm; int rc = 0; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH)) change_mm = true; if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; if (iopt_is_dmabuf(pages)) return -EINVAL; if (pages->type != IOPT_ADDRESS_USER) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, start_byte % PAGE_SIZE, data, length, flags); return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); } /* * Try to copy using copy_to_user(). We do this as a fast path and * ignore any pinning inconsistencies, unlike a real DMA path. */ if (change_mm) { if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); kthread_use_mm(pages->source_mm); } if (flags & IOMMUFD_ACCESS_RW_WRITE) { if (copy_to_user(pages->uptr + start_byte, data, length)) rc = -EFAULT; } else { if (copy_from_user(data, pages->uptr + start_byte, length)) rc = -EFAULT; } if (change_mm) { kthread_unuse_mm(pages->source_mm); mmput(pages->source_mm); } return rc; } static struct iopt_pages_access * iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, unsigned long last) { struct interval_tree_node *node; lockdep_assert_held(&pages->mutex); /* There can be overlapping ranges in this interval tree */ for (node = interval_tree_iter_first(&pages->access_itree, index, last); node; node = interval_tree_iter_next(node, index, last)) if (node->start == index && node->last == last) return container_of(node, struct iopt_pages_access, node); return NULL; } /** * iopt_area_add_access() - Record an in-knerel access for PFNs * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index * @out_pages: Output list of struct page's representing the PFNs * @flags: IOMMUFD_ACCESS_RW_* flags * @lock_area: Fail userspace munmap on this area * * Record that an in-kernel access will be accessing the pages, ensure they are * pinned, and return the PFNs as a simple list of 'struct page *'. * * This should be undone through a matching call to iopt_area_remove_access() */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, unsigned long last_index, struct page **out_pages, unsigned int flags, bool lock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; int rc; if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; mutex_lock(&pages->mutex); access = iopt_pages_get_exact_access(pages, start_index, last_index); if (access) { area->num_accesses++; if (lock_area) area->num_locks++; access->users++; iopt_pages_fill_from_xarray(pages, start_index, last_index, out_pages); mutex_unlock(&pages->mutex); return 0; } access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT); if (!access) { rc = -ENOMEM; goto err_unlock; } rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages); if (rc) goto err_free; access->node.start = start_index; access->node.last = last_index; access->users = 1; area->num_accesses++; if (lock_area) area->num_locks++; interval_tree_insert(&access->node, &pages->access_itree); mutex_unlock(&pages->mutex); return 0; err_free: kfree(access); err_unlock: mutex_unlock(&pages->mutex); return rc; } /** * iopt_area_remove_access() - Release an in-kernel access for PFNs * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index * @unlock_area: Must match the matching iopt_area_add_access()'s lock_area * * Undo iopt_area_add_access() and unpin the pages if necessary. The caller * must stop using the PFNs before calling this. */ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, unsigned long last_index, bool unlock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; mutex_lock(&pages->mutex); access = iopt_pages_get_exact_access(pages, start_index, last_index); if (WARN_ON(!access)) goto out_unlock; WARN_ON(area->num_accesses == 0 || access->users == 0); if (unlock_area) { WARN_ON(area->num_locks == 0); area->num_locks--; } area->num_accesses--; access->users--; if (access->users) goto out_unlock; interval_tree_remove(&access->node, &pages->access_itree); iopt_pages_unfill_xarray(pages, start_index, last_index); kfree(access); out_unlock: mutex_unlock(&pages->mutex); } |
| 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver for SCM Microsystems (a.k.a. Shuttle) USB-ATAPI cable * * Current development and maintenance by: * (c) 2000, 2001 Robert Baruch (autophile@starband.net) * (c) 2004, 2005 Daniel Drake <dsd@gentoo.org> * * Developed with the assistance of: * (c) 2002 Alan Stern <stern@rowland.org> * * Flash support based on earlier work by: * (c) 2002 Thomas Kreiling <usbdev@sm04.de> * * Many originally ATAPI devices were slightly modified to meet the USB * market by using some kind of translation from ATAPI to USB on the host, * and the peripheral would translate from USB back to ATAPI. * * SCM Microsystems (www.scmmicro.com) makes a device, sold to OEM's only, * which does the USB-to-ATAPI conversion. By obtaining the data sheet on * their device under nondisclosure agreement, I have been able to write * this driver for Linux. * * The chip used in the device can also be used for EPP and ISA translation * as well. This driver is only guaranteed to work with the ATAPI * translation. * * See the Kconfig help text for a list of devices known to be supported by * this driver. */ #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/cdrom.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include "usb.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "scsiglue.h" #define DRV_NAME "ums-usbat" MODULE_DESCRIPTION("Driver for SCM Microsystems (a.k.a. Shuttle) USB-ATAPI cable"); MODULE_AUTHOR("Daniel Drake <dsd@gentoo.org>, Robert Baruch <autophile@starband.net>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("USB_STORAGE"); /* Supported device types */ #define USBAT_DEV_HP8200 0x01 #define USBAT_DEV_FLASH 0x02 #define USBAT_EPP_PORT 0x10 #define USBAT_EPP_REGISTER 0x30 #define USBAT_ATA 0x40 #define USBAT_ISA 0x50 /* Commands (need to be logically OR'd with an access type */ #define USBAT_CMD_READ_REG 0x00 #define USBAT_CMD_WRITE_REG 0x01 #define USBAT_CMD_READ_BLOCK 0x02 #define USBAT_CMD_WRITE_BLOCK 0x03 #define USBAT_CMD_COND_READ_BLOCK 0x04 #define USBAT_CMD_COND_WRITE_BLOCK 0x05 #define USBAT_CMD_WRITE_REGS 0x07 /* Commands (these don't need an access type) */ #define USBAT_CMD_EXEC_CMD 0x80 #define USBAT_CMD_SET_FEAT 0x81 #define USBAT_CMD_UIO 0x82 /* Methods of accessing UIO register */ #define USBAT_UIO_READ 1 #define USBAT_UIO_WRITE 0 /* Qualifier bits */ #define USBAT_QUAL_FCQ 0x20 /* full compare */ #define USBAT_QUAL_ALQ 0x10 /* auto load subcount */ /* USBAT Flash Media status types */ #define USBAT_FLASH_MEDIA_NONE 0 #define USBAT_FLASH_MEDIA_CF 1 /* USBAT Flash Media change types */ #define USBAT_FLASH_MEDIA_SAME 0 #define USBAT_FLASH_MEDIA_CHANGED 1 /* USBAT ATA registers */ #define USBAT_ATA_DATA 0x10 /* read/write data (R/W) */ #define USBAT_ATA_FEATURES 0x11 /* set features (W) */ #define USBAT_ATA_ERROR 0x11 /* error (R) */ #define USBAT_ATA_SECCNT 0x12 /* sector count (R/W) */ #define USBAT_ATA_SECNUM 0x13 /* sector number (R/W) */ #define USBAT_ATA_LBA_ME 0x14 /* cylinder low (R/W) */ #define USBAT_ATA_LBA_HI 0x15 /* cylinder high (R/W) */ #define USBAT_ATA_DEVICE 0x16 /* head/device selection (R/W) */ #define USBAT_ATA_STATUS 0x17 /* device status (R) */ #define USBAT_ATA_CMD 0x17 /* device command (W) */ #define USBAT_ATA_ALTSTATUS 0x0E /* status (no clear IRQ) (R) */ /* USBAT User I/O Data registers */ #define USBAT_UIO_EPAD 0x80 /* Enable Peripheral Control Signals */ #define USBAT_UIO_CDT 0x40 /* Card Detect (Read Only) */ /* CDT = ACKD & !UI1 & !UI0 */ #define USBAT_UIO_1 0x20 /* I/O 1 */ #define USBAT_UIO_0 0x10 /* I/O 0 */ #define USBAT_UIO_EPP_ATA 0x08 /* 1=EPP mode, 0=ATA mode */ #define USBAT_UIO_UI1 0x04 /* Input 1 */ #define USBAT_UIO_UI0 0x02 /* Input 0 */ #define USBAT_UIO_INTR_ACK 0x01 /* Interrupt (ATA/ISA)/Acknowledge (EPP) */ /* USBAT User I/O Enable registers */ #define USBAT_UIO_DRVRST 0x80 /* Reset Peripheral */ #define USBAT_UIO_ACKD 0x40 /* Enable Card Detect */ #define USBAT_UIO_OE1 0x20 /* I/O 1 set=output/clr=input */ /* If ACKD=1, set OE1 to 1 also. */ #define USBAT_UIO_OE0 0x10 /* I/O 0 set=output/clr=input */ #define USBAT_UIO_ADPRST 0x01 /* Reset SCM chip */ /* USBAT Features */ #define USBAT_FEAT_ETEN 0x80 /* External trigger enable */ #define USBAT_FEAT_U1 0x08 #define USBAT_FEAT_U0 0x04 #define USBAT_FEAT_ET1 0x02 #define USBAT_FEAT_ET2 0x01 struct usbat_info { int devicetype; /* Used for Flash readers only */ unsigned long sectors; /* total sector count */ unsigned long ssize; /* sector size in bytes */ unsigned char sense_key; unsigned long sense_asc; /* additional sense code */ unsigned long sense_ascq; /* additional sense code qualifier */ }; #define short_pack(LSB,MSB) ( ((u16)(LSB)) | ( ((u16)(MSB))<<8 ) ) #define LSB_of(s) ((s)&0xFF) #define MSB_of(s) ((s)>>8) static int transferred = 0; static int usbat_flash_transport(struct scsi_cmnd * srb, struct us_data *us); static int usbat_hp8200e_transport(struct scsi_cmnd *srb, struct us_data *us); static int init_usbat_cd(struct us_data *us); static int init_usbat_flash(struct us_data *us); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static const struct usb_device_id usbat_usb_ids[] = { # include "unusual_usbat.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usbat_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static const struct us_unusual_dev usbat_unusual_dev_list[] = { # include "unusual_usbat.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV /* * Convenience function to produce an ATA read/write sectors command * Use cmd=0x20 for read, cmd=0x30 for write */ static void usbat_pack_ata_sector_cmd(unsigned char *buf, unsigned char thistime, u32 sector, unsigned char cmd) { buf[0] = 0; buf[1] = thistime; buf[2] = sector & 0xFF; buf[3] = (sector >> 8) & 0xFF; buf[4] = (sector >> 16) & 0xFF; buf[5] = 0xE0 | ((sector >> 24) & 0x0F); buf[6] = cmd; } /* * Convenience function to get the device type (flash or hp8200) */ static int usbat_get_device_type(struct us_data *us) { return ((struct usbat_info*)us->extra)->devicetype; } /* * Read a register from the device */ static int usbat_read(struct us_data *us, unsigned char access, unsigned char reg, unsigned char *content) { return usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, access | USBAT_CMD_READ_REG, 0xC0, (u16)reg, 0, content, 1); } /* * Write to a register on the device */ static int usbat_write(struct us_data *us, unsigned char access, unsigned char reg, unsigned char content) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, access | USBAT_CMD_WRITE_REG, 0x40, short_pack(reg, content), 0, NULL, 0); } /* * Convenience function to perform a bulk read */ static int usbat_bulk_read(struct us_data *us, void* buf, unsigned int len, int use_sg) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_sg(us, us->recv_bulk_pipe, buf, len, use_sg, NULL); } /* * Convenience function to perform a bulk write */ static int usbat_bulk_write(struct us_data *us, void* buf, unsigned int len, int use_sg) { if (len == 0) return USB_STOR_XFER_GOOD; usb_stor_dbg(us, "len = %d\n", len); return usb_stor_bulk_transfer_sg(us, us->send_bulk_pipe, buf, len, use_sg, NULL); } /* * Some USBAT-specific commands can only be executed over a command transport * This transport allows one (len=8) or two (len=16) vendor-specific commands * to be executed. */ static int usbat_execute_command(struct us_data *us, unsigned char *commands, unsigned int len) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, USBAT_CMD_EXEC_CMD, 0x40, 0, 0, commands, len); } /* * Read the status register */ static int usbat_get_status(struct us_data *us, unsigned char *status) { int rc; rc = usbat_read(us, USBAT_ATA, USBAT_ATA_STATUS, status); usb_stor_dbg(us, "0x%02X\n", *status); return rc; } /* * Check the device status */ static int usbat_check_status(struct us_data *us) { unsigned char *reply = us->iobuf; int rc; rc = usbat_get_status(us, reply); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* error/check condition (0x51 is ok) */ if (*reply & 0x01 && *reply != 0x51) return USB_STOR_TRANSPORT_FAILED; /* device fault */ if (*reply & 0x20) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Stores critical information in internal registers in preparation for the execution * of a conditional usbat_read_blocks or usbat_write_blocks call. */ static int usbat_set_shuttle_features(struct us_data *us, unsigned char external_trigger, unsigned char epp_control, unsigned char mask_byte, unsigned char test_pattern, unsigned char subcountH, unsigned char subcountL) { unsigned char *command = us->iobuf; command[0] = 0x40; command[1] = USBAT_CMD_SET_FEAT; /* * The only bit relevant to ATA access is bit 6 * which defines 8 bit data access (set) or 16 bit (unset) */ command[2] = epp_control; /* * If FCQ is set in the qualifier (defined in R/W cmd), then bits U0, U1, * ET1 and ET2 define an external event to be checked for on event of a * _read_blocks or _write_blocks operation. The read/write will not take * place unless the defined trigger signal is active. */ command[3] = external_trigger; /* * The resultant byte of the mask operation (see mask_byte) is compared for * equivalence with this test pattern. If equal, the read/write will take * place. */ command[4] = test_pattern; /* * This value is logically ANDed with the status register field specified * in the read/write command. */ command[5] = mask_byte; /* * If ALQ is set in the qualifier, this field contains the address of the * registers where the byte count should be read for transferring the data. * If ALQ is not set, then this field contains the number of bytes to be * transferred. */ command[6] = subcountL; command[7] = subcountH; return usbat_execute_command(us, command, 8); } /* * Block, waiting for an ATA device to become not busy or to report * an error condition. */ static int usbat_wait_not_busy(struct us_data *us, int minutes) { int i; int result; unsigned char *status = us->iobuf; /* * Synchronizing cache on a CDR could take a heck of a long time, * but probably not more than 10 minutes or so. On the other hand, * doing a full blank on a CDRW at speed 1 will take about 75 * minutes! */ for (i=0; i<1200+minutes*60; i++) { result = usbat_get_status(us, status); if (result!=USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (*status & 0x01) { /* check condition */ result = usbat_read(us, USBAT_ATA, 0x10, status); return USB_STOR_TRANSPORT_FAILED; } if (*status & 0x20) /* device fault */ return USB_STOR_TRANSPORT_FAILED; if ((*status & 0x80)==0x00) { /* not busy */ usb_stor_dbg(us, "Waited not busy for %d steps\n", i); return USB_STOR_TRANSPORT_GOOD; } if (i<500) msleep(10); /* 5 seconds */ else if (i<700) msleep(50); /* 10 seconds */ else if (i<1200) msleep(100); /* 50 seconds */ else msleep(1000); /* X minutes */ } usb_stor_dbg(us, "Waited not busy for %d minutes, timing out\n", minutes); return USB_STOR_TRANSPORT_FAILED; } /* * Read block data from the data register */ static int usbat_read_block(struct us_data *us, void* buf, unsigned short len, int use_sg) { int result; unsigned char *command = us->iobuf; if (!len) return USB_STOR_TRANSPORT_GOOD; command[0] = 0xC0; command[1] = USBAT_ATA | USBAT_CMD_READ_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = 0; command[4] = 0; command[5] = 0; command[6] = LSB_of(len); command[7] = MSB_of(len); result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; result = usbat_bulk_read(us, buf, len, use_sg); return (result == USB_STOR_XFER_GOOD ? USB_STOR_TRANSPORT_GOOD : USB_STOR_TRANSPORT_ERROR); } /* * Write block data via the data register */ static int usbat_write_block(struct us_data *us, unsigned char access, void* buf, unsigned short len, int minutes, int use_sg) { int result; unsigned char *command = us->iobuf; if (!len) return USB_STOR_TRANSPORT_GOOD; command[0] = 0x40; command[1] = access | USBAT_CMD_WRITE_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = 0; command[4] = 0; command[5] = 0; command[6] = LSB_of(len); command[7] = MSB_of(len); result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; result = usbat_bulk_write(us, buf, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return usbat_wait_not_busy(us, minutes); } /* * Process read and write requests */ static int usbat_hp8200e_rw_block_test(struct us_data *us, unsigned char access, unsigned char *registers, unsigned char *data_out, unsigned short num_registers, unsigned char data_reg, unsigned char status_reg, unsigned char timeout, unsigned char qualifier, int direction, void *buf, unsigned short len, int use_sg, int minutes) { int result; unsigned int pipe = (direction == DMA_FROM_DEVICE) ? us->recv_bulk_pipe : us->send_bulk_pipe; unsigned char *command = us->iobuf; int i, j; int cmdlen; unsigned char *data = us->iobuf; unsigned char *status = us->iobuf; BUG_ON(num_registers > US_IOBUF_SIZE/2); for (i=0; i<20; i++) { /* * The first time we send the full command, which consists * of downloading the SCSI command followed by downloading * the data via a write-and-test. Any other time we only * send the command to download the data -- the SCSI command * is still 'active' in some sense in the device. * * We're only going to try sending the data 10 times. After * that, we just return a failure. */ if (i==0) { cmdlen = 16; /* * Write to multiple registers * Not really sure the 0x07, 0x17, 0xfc, 0xe7 is * necessary here, but that's what came out of the * trace every single time. */ command[0] = 0x40; command[1] = access | USBAT_CMD_WRITE_REGS; command[2] = 0x07; command[3] = 0x17; command[4] = 0xFC; command[5] = 0xE7; command[6] = LSB_of(num_registers*2); command[7] = MSB_of(num_registers*2); } else cmdlen = 8; /* Conditionally read or write blocks */ command[cmdlen-8] = (direction==DMA_TO_DEVICE ? 0x40 : 0xC0); command[cmdlen-7] = access | (direction==DMA_TO_DEVICE ? USBAT_CMD_COND_WRITE_BLOCK : USBAT_CMD_COND_READ_BLOCK); command[cmdlen-6] = data_reg; command[cmdlen-5] = status_reg; command[cmdlen-4] = timeout; command[cmdlen-3] = qualifier; command[cmdlen-2] = LSB_of(len); command[cmdlen-1] = MSB_of(len); result = usbat_execute_command(us, command, cmdlen); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (i==0) { for (j=0; j<num_registers; j++) { data[j<<1] = registers[j]; data[1+(j<<1)] = data_out[j]; } result = usbat_bulk_write(us, data, num_registers*2, 0); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; } result = usb_stor_bulk_transfer_sg(us, pipe, buf, len, use_sg, NULL); /* * If we get a stall on the bulk download, we'll retry * the bulk download -- but not the SCSI command because * in some sense the SCSI command is still 'active' and * waiting for the data. Don't ask me why this should be; * I'm only following what the Windoze driver did. * * Note that a stall for the test-and-read/write command means * that the test failed. In this case we're testing to make * sure that the device is error-free * (i.e. bit 0 -- CHK -- of status is 0). The most likely * hypothesis is that the USBAT chip somehow knows what * the device will accept, but doesn't give the device any * data until all data is received. Thus, the device would * still be waiting for the first byte of data if a stall * occurs, even if the stall implies that some data was * transferred. */ if (result == USB_STOR_XFER_SHORT || result == USB_STOR_XFER_STALLED) { /* * If we're reading and we stalled, then clear * the bulk output pipe only the first time. */ if (direction==DMA_FROM_DEVICE && i==0) { if (usb_stor_clear_halt(us, us->send_bulk_pipe) < 0) return USB_STOR_TRANSPORT_ERROR; } /* * Read status: is the device angry, or just busy? */ result = usbat_read(us, USBAT_ATA, direction==DMA_TO_DEVICE ? USBAT_ATA_STATUS : USBAT_ATA_ALTSTATUS, status); if (result!=USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (*status & 0x01) /* check condition */ return USB_STOR_TRANSPORT_FAILED; if (*status & 0x20) /* device fault */ return USB_STOR_TRANSPORT_FAILED; usb_stor_dbg(us, "Redoing %s\n", str_write_read(direction == DMA_TO_DEVICE)); } else if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; else return usbat_wait_not_busy(us, minutes); } usb_stor_dbg(us, "Bummer! %s bulk data 20 times failed\n", direction == DMA_TO_DEVICE ? "Writing" : "Reading"); return USB_STOR_TRANSPORT_FAILED; } /* * Write to multiple registers: * Allows us to write specific data to any registers. The data to be written * gets packed in this sequence: reg0, data0, reg1, data1, ..., regN, dataN * which gets sent through bulk out. * Not designed for large transfers of data! */ static int usbat_multiple_write(struct us_data *us, unsigned char *registers, unsigned char *data_out, unsigned short num_registers) { int i, result; unsigned char *data = us->iobuf; unsigned char *command = us->iobuf; BUG_ON(num_registers > US_IOBUF_SIZE/2); /* Write to multiple registers, ATA access */ command[0] = 0x40; command[1] = USBAT_ATA | USBAT_CMD_WRITE_REGS; /* No relevance */ command[2] = 0; command[3] = 0; command[4] = 0; command[5] = 0; /* Number of bytes to be transferred (incl. addresses and data) */ command[6] = LSB_of(num_registers*2); command[7] = MSB_of(num_registers*2); /* The setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Create the reg/data, reg/data sequence */ for (i=0; i<num_registers; i++) { data[i<<1] = registers[i]; data[1+(i<<1)] = data_out[i]; } /* Send the data */ result = usbat_bulk_write(us, data, num_registers*2, 0); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_get_device_type(us) == USBAT_DEV_HP8200) return usbat_wait_not_busy(us, 0); else return USB_STOR_TRANSPORT_GOOD; } /* * Conditionally read blocks from device: * Allows us to read blocks from a specific data register, based upon the * condition that a status register can be successfully masked with a status * qualifier. If this condition is not initially met, the read will wait * up until a maximum amount of time has elapsed, as specified by timeout. * The read will start when the condition is met, otherwise the command aborts. * * The qualifier defined here is not the value that is masked, it defines * conditions for the write to take place. The actual masked qualifier (and * other related details) are defined beforehand with _set_shuttle_features(). */ static int usbat_read_blocks(struct us_data *us, void* buffer, int len, int use_sg) { int result; unsigned char *command = us->iobuf; command[0] = 0xC0; command[1] = USBAT_ATA | USBAT_CMD_COND_READ_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = USBAT_ATA_STATUS; command[4] = 0xFD; /* Timeout (ms); */ command[5] = USBAT_QUAL_FCQ; command[6] = LSB_of(len); command[7] = MSB_of(len); /* Multiple block read setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* Read the blocks we just asked for */ result = usbat_bulk_read(us, buffer, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Conditionally write blocks to device: * Allows us to write blocks to a specific data register, based upon the * condition that a status register can be successfully masked with a status * qualifier. If this condition is not initially met, the write will wait * up until a maximum amount of time has elapsed, as specified by timeout. * The read will start when the condition is met, otherwise the command aborts. * * The qualifier defined here is not the value that is masked, it defines * conditions for the write to take place. The actual masked qualifier (and * other related details) are defined beforehand with _set_shuttle_features(). */ static int usbat_write_blocks(struct us_data *us, void* buffer, int len, int use_sg) { int result; unsigned char *command = us->iobuf; command[0] = 0x40; command[1] = USBAT_ATA | USBAT_CMD_COND_WRITE_BLOCK; command[2] = USBAT_ATA_DATA; command[3] = USBAT_ATA_STATUS; command[4] = 0xFD; /* Timeout (ms) */ command[5] = USBAT_QUAL_FCQ; command[6] = LSB_of(len); command[7] = MSB_of(len); /* Multiple block write setup command */ result = usbat_execute_command(us, command, 8); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; /* Write the data */ result = usbat_bulk_write(us, buffer, len, use_sg); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_FAILED; return USB_STOR_TRANSPORT_GOOD; } /* * Read the User IO register */ static int usbat_read_user_io(struct us_data *us, unsigned char *data_flags) { int result; result = usb_stor_ctrl_transfer(us, us->recv_ctrl_pipe, USBAT_CMD_UIO, 0xC0, 0, 0, data_flags, USBAT_UIO_READ); usb_stor_dbg(us, "UIO register reads %02X\n", *data_flags); return result; } /* * Write to the User IO register */ static int usbat_write_user_io(struct us_data *us, unsigned char enable_flags, unsigned char data_flags) { return usb_stor_ctrl_transfer(us, us->send_ctrl_pipe, USBAT_CMD_UIO, 0x40, short_pack(enable_flags, data_flags), 0, NULL, USBAT_UIO_WRITE); } /* * Reset the device * Often needed on media change. */ static int usbat_device_reset(struct us_data *us) { int rc; /* * Reset peripheral, enable peripheral control signals * (bring reset signal up) */ rc = usbat_write_user_io(us, USBAT_UIO_DRVRST | USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* * Enable peripheral control signals * (bring reset signal down) */ rc = usbat_write_user_io(us, USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Enable card detect */ static int usbat_device_enable_cdt(struct us_data *us) { int rc; /* Enable peripheral control signals and card detect */ rc = usbat_write_user_io(us, USBAT_UIO_ACKD | USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; return USB_STOR_TRANSPORT_GOOD; } /* * Determine if media is present. */ static int usbat_flash_check_media_present(struct us_data *us, unsigned char *uio) { if (*uio & USBAT_UIO_UI0) { usb_stor_dbg(us, "no media detected\n"); return USBAT_FLASH_MEDIA_NONE; } return USBAT_FLASH_MEDIA_CF; } /* * Determine if media has changed since last operation */ static int usbat_flash_check_media_changed(struct us_data *us, unsigned char *uio) { if (*uio & USBAT_UIO_0) { usb_stor_dbg(us, "media change detected\n"); return USBAT_FLASH_MEDIA_CHANGED; } return USBAT_FLASH_MEDIA_SAME; } /* * Check for media change / no media and handle the situation appropriately */ static int usbat_flash_check_media(struct us_data *us, struct usbat_info *info) { int rc; unsigned char *uio = us->iobuf; rc = usbat_read_user_io(us, uio); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Check for media existence */ rc = usbat_flash_check_media_present(us, uio); if (rc == USBAT_FLASH_MEDIA_NONE) { info->sense_key = 0x02; info->sense_asc = 0x3A; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } /* Check for media change */ rc = usbat_flash_check_media_changed(us, uio); if (rc == USBAT_FLASH_MEDIA_CHANGED) { /* Reset and re-enable card detect */ rc = usbat_device_reset(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; rc = usbat_device_enable_cdt(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; msleep(50); rc = usbat_read_user_io(us, uio); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; info->sense_key = UNIT_ATTENTION; info->sense_asc = 0x28; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } return USB_STOR_TRANSPORT_GOOD; } /* * Determine whether we are controlling a flash-based reader/writer, * or a HP8200-based CD drive. * Sets transport functions as appropriate. */ static int usbat_identify_device(struct us_data *us, struct usbat_info *info) { int rc; unsigned char status; if (!us || !info) return USB_STOR_TRANSPORT_ERROR; rc = usbat_device_reset(us); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; msleep(500); /* * In attempt to distinguish between HP CDRW's and Flash readers, we now * execute the IDENTIFY PACKET DEVICE command. On ATA devices (i.e. flash * readers), this command should fail with error. On ATAPI devices (i.e. * CDROM drives), it should succeed. */ rc = usbat_write(us, USBAT_ATA, USBAT_ATA_CMD, 0xA1); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; rc = usbat_get_status(us, &status); if (rc != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; /* Check for error bit, or if the command 'fell through' */ if (status == 0xA1 || !(status & 0x01)) { /* Device is HP 8200 */ usb_stor_dbg(us, "Detected HP8200 CDRW\n"); info->devicetype = USBAT_DEV_HP8200; } else { /* Device is a CompactFlash reader/writer */ usb_stor_dbg(us, "Detected Flash reader/writer\n"); info->devicetype = USBAT_DEV_FLASH; } return USB_STOR_TRANSPORT_GOOD; } /* * Set the transport function based on the device type */ static int usbat_set_transport(struct us_data *us, struct usbat_info *info, int devicetype) { if (!info->devicetype) info->devicetype = devicetype; if (!info->devicetype) usbat_identify_device(us, info); switch (info->devicetype) { default: return USB_STOR_TRANSPORT_ERROR; case USBAT_DEV_HP8200: us->transport = usbat_hp8200e_transport; break; case USBAT_DEV_FLASH: us->transport = usbat_flash_transport; break; } return 0; } /* * Read the media capacity */ static int usbat_flash_get_sector_count(struct us_data *us, struct usbat_info *info) { unsigned char registers[3] = { USBAT_ATA_SECCNT, USBAT_ATA_DEVICE, USBAT_ATA_CMD, }; unsigned char command[3] = { 0x01, 0xA0, 0xEC }; unsigned char *reply; unsigned char status; int rc; if (!us || !info) return USB_STOR_TRANSPORT_ERROR; reply = kmalloc(512, GFP_NOIO); if (!reply) return USB_STOR_TRANSPORT_ERROR; /* ATA command : IDENTIFY DEVICE */ rc = usbat_multiple_write(us, registers, command, 3); if (rc != USB_STOR_XFER_GOOD) { usb_stor_dbg(us, "Gah! identify_device failed\n"); rc = USB_STOR_TRANSPORT_ERROR; goto leave; } /* Read device status */ if (usbat_get_status(us, &status) != USB_STOR_XFER_GOOD) { rc = USB_STOR_TRANSPORT_ERROR; goto leave; } msleep(100); /* Read the device identification data */ rc = usbat_read_block(us, reply, 512, 0); if (rc != USB_STOR_TRANSPORT_GOOD) goto leave; info->sectors = ((u32)(reply[117]) << 24) | ((u32)(reply[116]) << 16) | ((u32)(reply[115]) << 8) | ((u32)(reply[114]) ); rc = USB_STOR_TRANSPORT_GOOD; leave: kfree(reply); return rc; } /* * Read data from device */ static int usbat_flash_read_data(struct us_data *us, struct usbat_info *info, u32 sector, u32 sectors) { unsigned char registers[7] = { USBAT_ATA_FEATURES, USBAT_ATA_SECCNT, USBAT_ATA_SECNUM, USBAT_ATA_LBA_ME, USBAT_ATA_LBA_HI, USBAT_ATA_DEVICE, USBAT_ATA_STATUS, }; unsigned char command[7]; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * we're working in LBA mode. according to the ATA spec, * we can support up to 28-bit addressing. I don't know if Jumpshot * supports beyond 24-bit addressing. It's kind of hard to test * since it requires > 8GB CF card. */ if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; /* * Since we don't read more than 64 KB at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { /* * loop, never allocate or transfer more than 64k at once * (min(128k, 255*info->ssize) is the real limit) */ len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; /* ATA command 0x20 (READ SECTORS) */ usbat_pack_ata_sector_cmd(command, thistime, sector, 0x20); /* Write/execute ATA read command */ result = usbat_multiple_write(us, registers, command, 7); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; /* Read the data we just requested */ result = usbat_read_blocks(us, buffer, len, 0); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; usb_stor_dbg(us, "%d bytes\n", len); /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, TO_XFER_BUF); sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return USB_STOR_TRANSPORT_GOOD; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* * Write data to device */ static int usbat_flash_write_data(struct us_data *us, struct usbat_info *info, u32 sector, u32 sectors) { unsigned char registers[7] = { USBAT_ATA_FEATURES, USBAT_ATA_SECCNT, USBAT_ATA_SECNUM, USBAT_ATA_LBA_ME, USBAT_ATA_LBA_HI, USBAT_ATA_DEVICE, USBAT_ATA_STATUS, }; unsigned char command[7]; unsigned char *buffer; unsigned char thistime; unsigned int totallen, alloclen; int len, result; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; result = usbat_flash_check_media(us, info); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * we're working in LBA mode. according to the ATA spec, * we can support up to 28-bit addressing. I don't know if the device * supports beyond 24-bit addressing. It's kind of hard to test * since it requires > 8GB media. */ if (sector > 0x0FFFFFFF) return USB_STOR_TRANSPORT_ERROR; totallen = sectors * info->ssize; /* * Since we don't write more than 64 KB at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ alloclen = min(totallen, 65536u); buffer = kmalloc(alloclen, GFP_NOIO); if (buffer == NULL) return USB_STOR_TRANSPORT_ERROR; do { /* * loop, never allocate or transfer more than 64k at once * (min(128k, 255*info->ssize) is the real limit) */ len = min(totallen, alloclen); thistime = (len / info->ssize) & 0xff; /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &sg_offset, FROM_XFER_BUF); /* ATA command 0x30 (WRITE SECTORS) */ usbat_pack_ata_sector_cmd(command, thistime, sector, 0x30); /* Write/execute ATA write command */ result = usbat_multiple_write(us, registers, command, 7); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; /* Write the data */ result = usbat_write_blocks(us, buffer, len, 0); if (result != USB_STOR_TRANSPORT_GOOD) goto leave; sector += thistime; totallen -= len; } while (totallen > 0); kfree(buffer); return result; leave: kfree(buffer); return USB_STOR_TRANSPORT_ERROR; } /* * Squeeze a potentially huge (> 65535 byte) read10 command into * a little ( <= 65535 byte) ATAPI pipe */ static int usbat_hp8200e_handle_read10(struct us_data *us, unsigned char *registers, unsigned char *data, struct scsi_cmnd *srb) { int result = USB_STOR_TRANSPORT_GOOD; unsigned char *buffer; unsigned int len; unsigned int sector; unsigned int sg_offset = 0; struct scatterlist *sg = NULL; usb_stor_dbg(us, "transfersize %d\n", srb->transfersize); if (scsi_bufflen(srb) < 0x10000) { result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_FROM_DEVICE, scsi_sglist(srb), scsi_bufflen(srb), scsi_sg_count(srb), 1); return result; } /* * Since we're requesting more data than we can handle in * a single read command (max is 64k-1), we will perform * multiple reads, but each read must be in multiples of * a sector. Luckily the sector size is in srb->transfersize * (see linux/drivers/scsi/sr.c). */ if (data[7+0] == GPCMD_READ_CD) { len = short_pack(data[7+9], data[7+8]); len <<= 16; len |= data[7+7]; usb_stor_dbg(us, "GPCMD_READ_CD: len %d\n", len); srb->transfersize = scsi_bufflen(srb)/len; } if (!srb->transfersize) { srb->transfersize = 2048; /* A guess */ usb_stor_dbg(us, "transfersize 0, forcing %d\n", srb->transfersize); } /* * Since we only read in one block at a time, we have to create * a bounce buffer and move the data a piece at a time between the * bounce buffer and the actual transfer buffer. */ len = (65535/srb->transfersize) * srb->transfersize; usb_stor_dbg(us, "Max read is %d bytes\n", len); len = min(len, scsi_bufflen(srb)); buffer = kmalloc(len, GFP_NOIO); if (buffer == NULL) /* bloody hell! */ return USB_STOR_TRANSPORT_FAILED; sector = short_pack(data[7+3], data[7+2]); sector <<= 16; sector |= short_pack(data[7+5], data[7+4]); transferred = 0; while (transferred != scsi_bufflen(srb)) { if (len > scsi_bufflen(srb) - transferred) len = scsi_bufflen(srb) - transferred; data[3] = len&0xFF; /* (cylL) = expected length (L) */ data[4] = (len>>8)&0xFF; /* (cylH) = expected length (H) */ /* Fix up the SCSI command sector and num sectors */ data[7+2] = MSB_of(sector>>16); /* SCSI command sector */ data[7+3] = LSB_of(sector>>16); data[7+4] = MSB_of(sector&0xFFFF); data[7+5] = LSB_of(sector&0xFFFF); if (data[7+0] == GPCMD_READ_CD) data[7+6] = 0; data[7+7] = MSB_of(len / srb->transfersize); /* SCSI command */ data[7+8] = LSB_of(len / srb->transfersize); /* num sectors */ result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_FROM_DEVICE, buffer, len, 0, 1); if (result != USB_STOR_TRANSPORT_GOOD) break; /* Store the data in the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, srb, &sg, &sg_offset, TO_XFER_BUF); /* Update the amount transferred and the sector number */ transferred += len; sector += len / srb->transfersize; } /* while transferred != scsi_bufflen(srb) */ kfree(buffer); return result; } static int usbat_select_and_test_registers(struct us_data *us) { int selector; unsigned char *status = us->iobuf; /* try device = master, then device = slave. */ for (selector = 0xA0; selector <= 0xB0; selector += 0x10) { if (usbat_write(us, USBAT_ATA, USBAT_ATA_DEVICE, selector) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_STATUS, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_DEVICE, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_HI, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_write(us, USBAT_ATA, USBAT_ATA_LBA_ME, 0x55) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_write(us, USBAT_ATA, USBAT_ATA_LBA_HI, 0xAA) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; } return USB_STOR_TRANSPORT_GOOD; } /* * Initialize the USBAT processor and the storage device */ static int init_usbat(struct us_data *us, int devicetype) { int rc; struct usbat_info *info; unsigned char subcountH = USBAT_ATA_LBA_HI; unsigned char subcountL = USBAT_ATA_LBA_ME; unsigned char *status = us->iobuf; us->extra = kzalloc(sizeof(struct usbat_info), GFP_NOIO); if (!us->extra) return -ENOMEM; info = (struct usbat_info *) (us->extra); /* Enable peripheral control signals */ rc = usbat_write_user_io(us, USBAT_UIO_OE1 | USBAT_UIO_OE0, USBAT_UIO_EPAD | USBAT_UIO_1); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 1\n"); msleep(2000); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 2\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 3\n"); rc = usbat_select_and_test_registers(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 4\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 5\n"); /* Enable peripheral control signals and card detect */ rc = usbat_device_enable_cdt(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 6\n"); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 7\n"); msleep(1400); rc = usbat_read_user_io(us, status); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 8\n"); rc = usbat_select_and_test_registers(us); if (rc != USB_STOR_TRANSPORT_GOOD) return -EIO; usb_stor_dbg(us, "INIT 9\n"); /* At this point, we need to detect which device we are using */ if (usbat_set_transport(us, info, devicetype)) return -EIO; usb_stor_dbg(us, "INIT 10\n"); if (usbat_get_device_type(us) == USBAT_DEV_FLASH) { subcountH = 0x02; subcountL = 0x00; } rc = usbat_set_shuttle_features(us, (USBAT_FEAT_ETEN | USBAT_FEAT_ET2 | USBAT_FEAT_ET1), 0x00, 0x88, 0x08, subcountH, subcountL); if (rc != USB_STOR_XFER_GOOD) return -EIO; usb_stor_dbg(us, "INIT 11\n"); return 0; } /* * Transport for the HP 8200e */ static int usbat_hp8200e_transport(struct scsi_cmnd *srb, struct us_data *us) { int result; unsigned char *status = us->iobuf; unsigned char registers[32]; unsigned char data[32]; unsigned int len; int i; len = scsi_bufflen(srb); /* * Send A0 (ATA PACKET COMMAND). * Note: I guess we're never going to get any of the ATA * commands... just ATA Packet Commands. */ registers[0] = USBAT_ATA_FEATURES; registers[1] = USBAT_ATA_SECCNT; registers[2] = USBAT_ATA_SECNUM; registers[3] = USBAT_ATA_LBA_ME; registers[4] = USBAT_ATA_LBA_HI; registers[5] = USBAT_ATA_DEVICE; registers[6] = USBAT_ATA_CMD; data[0] = 0x00; data[1] = 0x00; data[2] = 0x00; data[3] = len&0xFF; /* (cylL) = expected length (L) */ data[4] = (len>>8)&0xFF; /* (cylH) = expected length (H) */ data[5] = 0xB0; /* (device sel) = slave */ data[6] = 0xA0; /* (command) = ATA PACKET COMMAND */ for (i=7; i<19; i++) { registers[i] = 0x10; data[i] = (i-7 >= srb->cmd_len) ? 0 : srb->cmnd[i-7]; } result = usbat_get_status(us, status); usb_stor_dbg(us, "Status = %02X\n", *status); if (result != USB_STOR_XFER_GOOD) return USB_STOR_TRANSPORT_ERROR; if (srb->cmnd[0] == TEST_UNIT_READY) transferred = 0; if (srb->sc_data_direction == DMA_TO_DEVICE) { result = usbat_hp8200e_rw_block_test(us, USBAT_ATA, registers, data, 19, USBAT_ATA_DATA, USBAT_ATA_STATUS, 0xFD, (USBAT_QUAL_FCQ | USBAT_QUAL_ALQ), DMA_TO_DEVICE, scsi_sglist(srb), len, scsi_sg_count(srb), 10); if (result == USB_STOR_TRANSPORT_GOOD) { transferred += len; usb_stor_dbg(us, "Wrote %08X bytes\n", transferred); } return result; } else if (srb->cmnd[0] == READ_10 || srb->cmnd[0] == GPCMD_READ_CD) { return usbat_hp8200e_handle_read10(us, registers, data, srb); } if (len > 0xFFFF) { usb_stor_dbg(us, "Error: len = %08X... what do I do now?\n", len); return USB_STOR_TRANSPORT_ERROR; } result = usbat_multiple_write(us, registers, data, 7); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* * Write the 12-byte command header. * * If the command is BLANK then set the timer for 75 minutes. * Otherwise set it for 10 minutes. * * NOTE: THE 8200 DOCUMENTATION STATES THAT BLANKING A CDRW * AT SPEED 4 IS UNRELIABLE!!! */ result = usbat_write_block(us, USBAT_ATA, srb->cmnd, 12, srb->cmnd[0] == GPCMD_BLANK ? 75 : 10, 0); if (result != USB_STOR_TRANSPORT_GOOD) return result; /* If there is response data to be read in then do it here. */ if (len != 0 && (srb->sc_data_direction == DMA_FROM_DEVICE)) { /* How many bytes to read in? Check cylL register */ if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_ME, status) != USB_STOR_XFER_GOOD) { return USB_STOR_TRANSPORT_ERROR; } if (len > 0xFF) { /* need to read cylH also */ len = *status; if (usbat_read(us, USBAT_ATA, USBAT_ATA_LBA_HI, status) != USB_STOR_XFER_GOOD) { return USB_STOR_TRANSPORT_ERROR; } len += ((unsigned int) *status)<<8; } else len = *status; result = usbat_read_block(us, scsi_sglist(srb), len, scsi_sg_count(srb)); } return result; } /* * Transport for USBAT02-based CompactFlash and similar storage devices */ static int usbat_flash_transport(struct scsi_cmnd * srb, struct us_data *us) { int rc; struct usbat_info *info = (struct usbat_info *) (us->extra); unsigned long block, blocks; unsigned char *ptr = us->iobuf; static const unsigned char inquiry_response[36] = { 0x00, 0x80, 0x00, 0x01, 0x1F, 0x00, 0x00, 0x00 }; if (srb->cmnd[0] == INQUIRY) { usb_stor_dbg(us, "INQUIRY - Returning bogus response\n"); memcpy(ptr, inquiry_response, sizeof(inquiry_response)); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_CAPACITY) { rc = usbat_flash_check_media(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; rc = usbat_flash_get_sector_count(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; /* hard coded 512 byte sectors as per ATA spec */ info->ssize = 0x200; usb_stor_dbg(us, "READ_CAPACITY: %ld sectors, %ld bytes per sector\n", info->sectors, info->ssize); /* * build the reply * note: must return the sector number of the last sector, * *not* the total number of sectors */ ((__be32 *) ptr)[0] = cpu_to_be32(info->sectors - 1); ((__be32 *) ptr)[1] = cpu_to_be32(info->ssize); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == MODE_SELECT_10) { usb_stor_dbg(us, "Gah! MODE_SELECT_10\n"); return USB_STOR_TRANSPORT_ERROR; } if (srb->cmnd[0] == READ_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "READ_10: read block 0x%04lx count %ld\n", block, blocks); return usbat_flash_read_data(us, info, block, blocks); } if (srb->cmnd[0] == READ_12) { /* * I don't think we'll ever see a READ_12 but support it anyway */ block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "READ_12: read block 0x%04lx count %ld\n", block, blocks); return usbat_flash_read_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_10) { block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[7]) << 8) | ((u32)(srb->cmnd[8])); usb_stor_dbg(us, "WRITE_10: write block 0x%04lx count %ld\n", block, blocks); return usbat_flash_write_data(us, info, block, blocks); } if (srb->cmnd[0] == WRITE_12) { /* * I don't think we'll ever see a WRITE_12 but support it anyway */ block = ((u32)(srb->cmnd[2]) << 24) | ((u32)(srb->cmnd[3]) << 16) | ((u32)(srb->cmnd[4]) << 8) | ((u32)(srb->cmnd[5])); blocks = ((u32)(srb->cmnd[6]) << 24) | ((u32)(srb->cmnd[7]) << 16) | ((u32)(srb->cmnd[8]) << 8) | ((u32)(srb->cmnd[9])); usb_stor_dbg(us, "WRITE_12: write block 0x%04lx count %ld\n", block, blocks); return usbat_flash_write_data(us, info, block, blocks); } if (srb->cmnd[0] == TEST_UNIT_READY) { usb_stor_dbg(us, "TEST_UNIT_READY\n"); rc = usbat_flash_check_media(us, info); if (rc != USB_STOR_TRANSPORT_GOOD) return rc; return usbat_check_status(us); } if (srb->cmnd[0] == REQUEST_SENSE) { usb_stor_dbg(us, "REQUEST_SENSE\n"); memset(ptr, 0, 18); ptr[0] = 0xF0; ptr[2] = info->sense_key; ptr[7] = 11; ptr[12] = info->sense_asc; ptr[13] = info->sense_ascq; usb_stor_set_xfer_buf(ptr, 18, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) { /* * sure. whatever. not like we can stop the user from popping * the media out of the device (no locking doors, etc) */ return USB_STOR_TRANSPORT_GOOD; } usb_stor_dbg(us, "Gah! Unknown command: %d (0x%x)\n", srb->cmnd[0], srb->cmnd[0]); info->sense_key = 0x05; info->sense_asc = 0x20; info->sense_ascq = 0x00; return USB_STOR_TRANSPORT_FAILED; } static int init_usbat_cd(struct us_data *us) { return init_usbat(us, USBAT_DEV_HP8200); } static int init_usbat_flash(struct us_data *us) { return init_usbat(us, USBAT_DEV_FLASH); } static struct scsi_host_template usbat_host_template; static int usbat_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - usbat_usb_ids) + usbat_unusual_dev_list, &usbat_host_template); if (result) return result; /* * The actual transport will be determined later by the * initialization routine; this is just a placeholder. */ us->transport_name = "Shuttle USBAT"; us->transport = usbat_flash_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 0; result = usb_stor_probe2(us); return result; } static struct usb_driver usbat_driver = { .name = DRV_NAME, .probe = usbat_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = usbat_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(usbat_driver, usbat_host_template, DRV_NAME); |
| 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 10 11 10 11 10 11 13 13 13 13 12 13 13 5 8 8 8 8 8 6 6 6 6 6 5 5 4 4 6 8 1 1 8 8 5 9 9 9 9 9 9 8 8 8 8 8 5 5 5 5 5 8 8 8 8 8 8 8 8 8 8 4 5 4 5 5 5 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Core registration and callback routines for MTD * drivers and users. * * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org> * Copyright © 2006 Red Hat UK Limited */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/ioctl.h> #include <linux/init.h> #include <linux/of.h> #include <linux/proc_fs.h> #include <linux/idr.h> #include <linux/backing-dev.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/reboot.h> #include <linux/leds.h> #include <linux/debugfs.h> #include <linux/nvmem-provider.h> #include <linux/root_dev.h> #include <linux/error-injection.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> #include "mtdcore.h" struct backing_dev_info *mtd_bdi; #ifdef CONFIG_PM_SLEEP static int mtd_cls_suspend(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); return mtd ? mtd_suspend(mtd) : 0; } static int mtd_cls_resume(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); if (mtd) mtd_resume(mtd); return 0; } static SIMPLE_DEV_PM_OPS(mtd_cls_pm_ops, mtd_cls_suspend, mtd_cls_resume); #define MTD_CLS_PM_OPS (&mtd_cls_pm_ops) #else #define MTD_CLS_PM_OPS NULL #endif static struct class mtd_class = { .name = "mtd", .pm = MTD_CLS_PM_OPS, }; static DEFINE_IDR(mtd_idr); /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ DEFINE_MUTEX(mtd_table_mutex); EXPORT_SYMBOL_GPL(mtd_table_mutex); struct mtd_info *__mtd_next_device(int i) { return idr_get_next(&mtd_idr, &i); } EXPORT_SYMBOL_GPL(__mtd_next_device); static LIST_HEAD(mtd_notifiers); #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) /* REVISIT once MTD uses the driver model better, whoever allocates * the mtd_info will probably want to use the release() hook... */ static void mtd_release(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); dev_t index = MTD_DEVT(mtd->index); idr_remove(&mtd_idr, mtd->index); of_node_put(mtd_get_of_node(mtd)); if (mtd_is_partition(mtd)) release_mtd_partition(mtd); /* remove /dev/mtdXro node */ device_destroy(&mtd_class, index + 1); } static void mtd_device_release(struct kref *kref) { struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt); bool is_partition = mtd_is_partition(mtd); debugfs_remove_recursive(mtd->dbg.dfs_dir); /* Try to remove the NVMEM provider */ nvmem_unregister(mtd->nvmem); device_unregister(&mtd->dev); /* * Clear dev so mtd can be safely re-registered later if desired. * Should not be done for partition, * as it was already destroyed in device_unregister(). */ if (!is_partition) memset(&mtd->dev, 0, sizeof(mtd->dev)); module_put(THIS_MODULE); } #define MTD_DEVICE_ATTR_RO(name) \ static DEVICE_ATTR(name, 0444, mtd_##name##_show, NULL) #define MTD_DEVICE_ATTR_RW(name) \ static DEVICE_ATTR(name, 0644, mtd_##name##_show, mtd_##name##_store) static ssize_t mtd_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); char *type; switch (mtd->type) { case MTD_ABSENT: type = "absent"; break; case MTD_RAM: type = "ram"; break; case MTD_ROM: type = "rom"; break; case MTD_NORFLASH: type = "nor"; break; case MTD_NANDFLASH: type = "nand"; break; case MTD_DATAFLASH: type = "dataflash"; break; case MTD_UBIVOLUME: type = "ubi"; break; case MTD_MLCNANDFLASH: type = "mlc-nand"; break; default: type = "unknown"; } return sysfs_emit(buf, "%s\n", type); } MTD_DEVICE_ATTR_RO(type); static ssize_t mtd_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "0x%lx\n", (unsigned long)mtd->flags); } MTD_DEVICE_ATTR_RO(flags); static ssize_t mtd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", (unsigned long long)mtd->size); } MTD_DEVICE_ATTR_RO(size); static ssize_t mtd_erasesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->erasesize); } MTD_DEVICE_ATTR_RO(erasesize); static ssize_t mtd_writesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->writesize); } MTD_DEVICE_ATTR_RO(writesize); static ssize_t mtd_subpagesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int subpagesize = mtd->writesize >> mtd->subpage_sft; return sysfs_emit(buf, "%u\n", subpagesize); } MTD_DEVICE_ATTR_RO(subpagesize); static ssize_t mtd_oobsize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->oobsize); } MTD_DEVICE_ATTR_RO(oobsize); static ssize_t mtd_oobavail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->oobavail); } MTD_DEVICE_ATTR_RO(oobavail); static ssize_t mtd_numeraseregions_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->numeraseregions); } MTD_DEVICE_ATTR_RO(numeraseregions); static ssize_t mtd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", mtd->name); } MTD_DEVICE_ATTR_RO(name); static ssize_t mtd_ecc_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_strength); } MTD_DEVICE_ATTR_RO(ecc_strength); static ssize_t mtd_bitflip_threshold_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->bitflip_threshold); } static ssize_t mtd_bitflip_threshold_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int bitflip_threshold; int retval; retval = kstrtouint(buf, 0, &bitflip_threshold); if (retval) return retval; mtd->bitflip_threshold = bitflip_threshold; return count; } MTD_DEVICE_ATTR_RW(bitflip_threshold); static ssize_t mtd_ecc_step_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_step_size); } MTD_DEVICE_ATTR_RO(ecc_step_size); static ssize_t mtd_corrected_bits_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->corrected); } MTD_DEVICE_ATTR_RO(corrected_bits); /* ecc stats corrected */ static ssize_t mtd_ecc_failures_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->failed); } MTD_DEVICE_ATTR_RO(ecc_failures); /* ecc stats errors */ static ssize_t mtd_bad_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->badblocks); } MTD_DEVICE_ATTR_RO(bad_blocks); static ssize_t mtd_bbt_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->bbtblocks); } MTD_DEVICE_ATTR_RO(bbt_blocks); static struct attribute *mtd_attrs[] = { &dev_attr_type.attr, &dev_attr_flags.attr, &dev_attr_size.attr, &dev_attr_erasesize.attr, &dev_attr_writesize.attr, &dev_attr_subpagesize.attr, &dev_attr_oobsize.attr, &dev_attr_oobavail.attr, &dev_attr_numeraseregions.attr, &dev_attr_name.attr, &dev_attr_ecc_strength.attr, &dev_attr_ecc_step_size.attr, &dev_attr_corrected_bits.attr, &dev_attr_ecc_failures.attr, &dev_attr_bad_blocks.attr, &dev_attr_bbt_blocks.attr, &dev_attr_bitflip_threshold.attr, NULL, }; ATTRIBUTE_GROUPS(mtd); static const struct device_type mtd_devtype = { .name = "mtd", .groups = mtd_groups, .release = mtd_release, }; static bool mtd_expert_analysis_mode; #ifdef CONFIG_DEBUG_FS bool mtd_check_expert_analysis_mode(void) { const char *mtd_expert_analysis_warning = "Bad block checks have been entirely disabled.\n" "This is only reserved for post-mortem forensics and debug purposes.\n" "Never enable this mode if you do not know what you are doing!\n"; return WARN_ONCE(mtd_expert_analysis_mode, mtd_expert_analysis_warning); } EXPORT_SYMBOL_GPL(mtd_check_expert_analysis_mode); #endif static struct dentry *dfs_dir_mtd; static int mtd_ooblayout_show(struct seq_file *s, void *p, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *region)) { struct mtd_info *mtd = s->private; int section; for (section = 0;; section++) { struct mtd_oob_region region; int err; err = iter(mtd, section, ®ion); if (err) { if (err == -ERANGE) break; return err; } seq_printf(s, "%-3d %4u %4u\n", section, region.offset, region.length); } return 0; } static int mtd_ooblayout_ecc_show(struct seq_file *s, void *p) { return mtd_ooblayout_show(s, p, mtd_ooblayout_ecc); } DEFINE_SHOW_ATTRIBUTE(mtd_ooblayout_ecc); static int mtd_ooblayout_free_show(struct seq_file *s, void *p) { return mtd_ooblayout_show(s, p, mtd_ooblayout_free); } DEFINE_SHOW_ATTRIBUTE(mtd_ooblayout_free); static void mtd_debugfs_populate(struct mtd_info *mtd) { struct device *dev = &mtd->dev; struct mtd_oob_region region; if (IS_ERR_OR_NULL(dfs_dir_mtd)) return; mtd->dbg.dfs_dir = debugfs_create_dir(dev_name(dev), dfs_dir_mtd); if (IS_ERR_OR_NULL(mtd->dbg.dfs_dir)) return; /* Create ooblayout files only if at least one region is present. */ if (mtd_ooblayout_ecc(mtd, 0, ®ion) == 0) debugfs_create_file("ooblayout_ecc", 0444, mtd->dbg.dfs_dir, mtd, &mtd_ooblayout_ecc_fops); if (mtd_ooblayout_free(mtd, 0, ®ion) == 0) debugfs_create_file("ooblayout_free", 0444, mtd->dbg.dfs_dir, mtd, &mtd_ooblayout_free_fops); } #ifndef CONFIG_MMU unsigned mtd_mmap_capabilities(struct mtd_info *mtd) { switch (mtd->type) { case MTD_RAM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ | NOMMU_MAP_WRITE; case MTD_ROM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ; default: return NOMMU_MAP_COPY; } } EXPORT_SYMBOL_GPL(mtd_mmap_capabilities); #endif static int mtd_reboot_notifier(struct notifier_block *n, unsigned long state, void *cmd) { struct mtd_info *mtd; mtd = container_of(n, struct mtd_info, reboot_notifier); mtd->_reboot(mtd); return NOTIFY_DONE; } /** * mtd_wunit_to_pairing_info - get pairing information of a wunit * @mtd: pointer to new MTD device info structure * @wunit: write unit we are interested in * @info: returned pairing information * * Retrieve pairing information associated to the wunit. * This is mainly useful when dealing with MLC/TLC NANDs where pages can be * paired together, and where programming a page may influence the page it is * paired with. * The notion of page is replaced by the term wunit (write-unit) to stay * consistent with the ->writesize field. * * The @wunit argument can be extracted from an absolute offset using * mtd_offset_to_wunit(). @info is filled with the pairing information attached * to @wunit. * * From the pairing info the MTD user can find all the wunits paired with * @wunit using the following loop: * * for (i = 0; i < mtd_pairing_groups(mtd); i++) { * info.pair = i; * mtd_pairing_info_to_wunit(mtd, &info); * ... * } */ int mtd_wunit_to_pairing_info(struct mtd_info *mtd, int wunit, struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int npairs = mtd_wunit_per_eb(master) / mtd_pairing_groups(master); if (wunit < 0 || wunit >= npairs) return -EINVAL; if (master->pairing && master->pairing->get_info) return master->pairing->get_info(master, wunit, info); info->group = 0; info->pair = wunit; return 0; } EXPORT_SYMBOL_GPL(mtd_wunit_to_pairing_info); /** * mtd_pairing_info_to_wunit - get wunit from pairing information * @mtd: pointer to new MTD device info structure * @info: pairing information struct * * Returns a positive number representing the wunit associated to the info * struct, or a negative error code. * * This is the reverse of mtd_wunit_to_pairing_info(), and can help one to * iterate over all wunits of a given pair (see mtd_wunit_to_pairing_info() * doc). * * It can also be used to only program the first page of each pair (i.e. * page attached to group 0), which allows one to use an MLC NAND in * software-emulated SLC mode: * * info.group = 0; * npairs = mtd_wunit_per_eb(mtd) / mtd_pairing_groups(mtd); * for (info.pair = 0; info.pair < npairs; info.pair++) { * wunit = mtd_pairing_info_to_wunit(mtd, &info); * mtd_write(mtd, mtd_wunit_to_offset(mtd, blkoffs, wunit), * mtd->writesize, &retlen, buf + (i * mtd->writesize)); * } */ int mtd_pairing_info_to_wunit(struct mtd_info *mtd, const struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int ngroups = mtd_pairing_groups(master); int npairs = mtd_wunit_per_eb(master) / ngroups; if (!info || info->pair < 0 || info->pair >= npairs || info->group < 0 || info->group >= ngroups) return -EINVAL; if (master->pairing && master->pairing->get_wunit) return mtd->pairing->get_wunit(master, info); return info->pair; } EXPORT_SYMBOL_GPL(mtd_pairing_info_to_wunit); /** * mtd_pairing_groups - get the number of pairing groups * @mtd: pointer to new MTD device info structure * * Returns the number of pairing groups. * * This number is usually equal to the number of bits exposed by a single * cell, and can be used in conjunction with mtd_pairing_info_to_wunit() * to iterate over all pages of a given pair. */ int mtd_pairing_groups(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); if (!master->pairing || !master->pairing->ngroups) return 1; return master->pairing->ngroups; } EXPORT_SYMBOL_GPL(mtd_pairing_groups); static int mtd_nvmem_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int err; err = mtd_read(mtd, offset, bytes, &retlen, val); if (err && err != -EUCLEAN) return err; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_add(struct mtd_info *mtd) { struct device_node *node = mtd_get_of_node(mtd); struct nvmem_config config = {}; config.id = NVMEM_DEVID_NONE; config.dev = &mtd->dev; config.name = dev_name(&mtd->dev); config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = of_device_is_compatible(node, "nvmem-cells"); config.reg_read = mtd_nvmem_reg_read; config.size = mtd->size; config.word_size = 1; config.stride = 1; config.read_only = true; config.root_only = true; config.ignore_wp = true; config.priv = mtd; mtd->nvmem = nvmem_register(&config); if (IS_ERR(mtd->nvmem)) { /* Just ignore if there is no NVMEM support in the kernel */ if (PTR_ERR(mtd->nvmem) == -EOPNOTSUPP) mtd->nvmem = NULL; else return dev_err_probe(&mtd->dev, PTR_ERR(mtd->nvmem), "Failed to register NVMEM device\n"); } return 0; } static void mtd_check_of_node(struct mtd_info *mtd) { struct device_node *partitions, *parent_dn, *mtd_dn = NULL; const char *pname, *prefix = "partition-"; int plen, mtd_name_len, offset, prefix_len; /* Check if MTD already has a device node */ if (mtd_get_of_node(mtd)) return; if (!mtd_is_partition(mtd)) return; parent_dn = of_node_get(mtd_get_of_node(mtd->parent)); if (!parent_dn) return; if (mtd_is_partition(mtd->parent)) partitions = of_node_get(parent_dn); else partitions = of_get_child_by_name(parent_dn, "partitions"); if (!partitions) goto exit_parent; prefix_len = strlen(prefix); mtd_name_len = strlen(mtd->name); /* Search if a partition is defined with the same name */ for_each_child_of_node(partitions, mtd_dn) { /* Skip partition with no/wrong prefix */ if (!of_node_name_prefix(mtd_dn, prefix)) continue; /* Label have priority. Check that first */ if (!of_property_read_string(mtd_dn, "label", &pname)) { offset = 0; } else { pname = mtd_dn->name; offset = prefix_len; } plen = strlen(pname) - offset; if (plen == mtd_name_len && !strncmp(mtd->name, pname + offset, plen)) { mtd_set_of_node(mtd, mtd_dn); of_node_put(mtd_dn); break; } } of_node_put(partitions); exit_parent: of_node_put(parent_dn); } /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure * * Add a device to the list of MTD devices present in the system, and * notify each currently active MTD 'user' of its arrival. Returns * zero on success or non-zero on failure. */ int add_mtd_device(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); struct mtd_info *master = mtd_get_master(mtd); struct mtd_notifier *not; int i, error, ofidx; /* * May occur, for instance, on buggy drivers which call * mtd_device_parse_register() multiple times on the same master MTD, * especially with CONFIG_MTD_PARTITIONED_MASTER=y. */ if (WARN_ONCE(mtd->dev.type, "MTD already registered\n")) return -EEXIST; BUG_ON(mtd->writesize == 0); /* * MTD drivers should implement ->_{write,read}() or * ->_{write,read}_oob(), but not both. */ if (WARN_ON((mtd->_write && mtd->_write_oob) || (mtd->_read && mtd->_read_oob))) return -EINVAL; if (WARN_ON((!mtd->erasesize || !master->_erase) && !(mtd->flags & MTD_NO_ERASE))) return -EINVAL; /* * MTD_SLC_ON_MLC_EMULATION can only be set on partitions, when the * master is an MLC NAND and has a proper pairing scheme defined. * We also reject masters that implement ->_writev() for now, because * NAND controller drivers don't implement this hook, and adding the * SLC -> MLC address/length conversion to this path is useless if we * don't have a user. */ if (mtd->flags & MTD_SLC_ON_MLC_EMULATION && (!mtd_is_partition(mtd) || master->type != MTD_MLCNANDFLASH || !master->pairing || master->_writev)) return -EINVAL; mutex_lock(&mtd_table_mutex); ofidx = -1; if (np) ofidx = of_alias_get_id(np, "mtd"); if (ofidx >= 0) i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); else i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); if (i < 0) { error = i; goto fail_locked; } mtd->index = i; kref_init(&mtd->refcnt); /* default value if not set by driver */ if (mtd->bitflip_threshold == 0) mtd->bitflip_threshold = mtd->ecc_strength; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { int ngroups = mtd_pairing_groups(master); mtd->erasesize /= ngroups; mtd->size = (u64)mtd_div_by_eb(mtd->size, master) * mtd->erasesize; } if (is_power_of_2(mtd->erasesize)) mtd->erasesize_shift = ffs(mtd->erasesize) - 1; else mtd->erasesize_shift = 0; if (is_power_of_2(mtd->writesize)) mtd->writesize_shift = ffs(mtd->writesize) - 1; else mtd->writesize_shift = 0; mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; /* Some chips always power up locked. Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) { error = mtd_unlock(mtd, 0, mtd->size); if (error && error != -EOPNOTSUPP) printk(KERN_WARNING "%s: unlock failed, writes may not work\n", mtd->name); /* Ignore unlock failures? */ error = 0; } /* Caller should have set dev.parent to match the * physical device, if appropriate. */ mtd->dev.type = &mtd_devtype; mtd->dev.class = &mtd_class; mtd->dev.devt = MTD_DEVT(i); error = dev_set_name(&mtd->dev, "mtd%d", i); if (error) goto fail_devname; dev_set_drvdata(&mtd->dev, mtd); mtd_check_of_node(mtd); of_node_get(mtd_get_of_node(mtd)); error = device_register(&mtd->dev); if (error) { put_device(&mtd->dev); goto fail_added; } /* Add the nvmem provider */ error = mtd_nvmem_add(mtd); if (error) goto fail_nvmem_add; mtd_debugfs_populate(mtd); device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, "mtd%dro", i); pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->add(mtd); mutex_unlock(&mtd_table_mutex); if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { if (IS_BUILTIN(CONFIG_MTD)) { pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name); ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); } else { pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", mtd->index, mtd->name); } } /* We _know_ we aren't being removed, because our caller is still holding us here. So none of this try_ nonsense, and no bitching about it either. :) */ __module_get(THIS_MODULE); return 0; fail_nvmem_add: device_unregister(&mtd->dev); fail_added: of_node_put(mtd_get_of_node(mtd)); fail_devname: idr_remove(&mtd_idr, i); fail_locked: mutex_unlock(&mtd_table_mutex); return error; } /** * del_mtd_device - unregister an MTD device * @mtd: pointer to MTD device info structure * * Remove a device from the list of MTD devices present in the system, * and notify each currently active MTD 'user' of its departure. * Returns zero on success or 1 on failure, which currently will happen * if the requested device does not appear to be present in the list. */ int del_mtd_device(struct mtd_info *mtd) { int ret; struct mtd_notifier *not; mutex_lock(&mtd_table_mutex); if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; } /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->remove(mtd); kref_put(&mtd->refcnt, mtd_device_release); ret = 0; out_error: mutex_unlock(&mtd_table_mutex); return ret; } /* * Set a few defaults based on the parent devices, if not provided by the * driver */ static void mtd_set_dev_defaults(struct mtd_info *mtd) { if (mtd->dev.parent) { if (!mtd->owner && mtd->dev.parent->driver) mtd->owner = mtd->dev.parent->driver->owner; if (!mtd->name) mtd->name = dev_name(mtd->dev.parent); } else { pr_debug("mtd device won't show a device symlink in sysfs\n"); } INIT_LIST_HEAD(&mtd->partitions); mutex_init(&mtd->master.partitions_lock); mutex_init(&mtd->master.chrdev_lock); } static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user) { struct otp_info *info; ssize_t size = 0; unsigned int i; size_t retlen; int ret; info = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!info) return -ENOMEM; if (is_user) ret = mtd_get_user_prot_info(mtd, PAGE_SIZE, &retlen, info); else ret = mtd_get_fact_prot_info(mtd, PAGE_SIZE, &retlen, info); if (ret) goto err; for (i = 0; i < retlen / sizeof(*info); i++) size += info[i].length; kfree(info); return size; err: kfree(info); /* ENODATA means there is no OTP region. */ return ret == -ENODATA ? 0 : ret; } static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, const char *compatible, int size, nvmem_reg_read_t reg_read) { struct nvmem_device *nvmem = NULL; struct nvmem_config config = {}; struct device_node *np; /* DT binding is optional */ np = of_get_compatible_child(mtd->dev.of_node, compatible); /* OTP nvmem will be registered on the physical device */ config.dev = mtd->dev.parent; config.name = compatible; config.id = NVMEM_DEVID_AUTO; config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = !mtd_type_is_nand(mtd); config.type = NVMEM_TYPE_OTP; config.root_only = true; config.ignore_wp = true; config.reg_read = reg_read; config.size = size; config.of_node = np; config.priv = mtd; nvmem = nvmem_register(&config); /* Just ignore if there is no NVMEM support in the kernel */ if (IS_ERR(nvmem) && PTR_ERR(nvmem) == -EOPNOTSUPP) nvmem = NULL; of_node_put(np); return nvmem; } static int mtd_nvmem_user_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_user_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_fact_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_fact_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_otp_nvmem_add(struct mtd_info *mtd) { struct device *dev = mtd->dev.parent; struct nvmem_device *nvmem; ssize_t size; int err; if (mtd->_get_user_prot_info && mtd->_read_user_prot_reg) { size = mtd_otp_size(mtd, true); if (size < 0) { err = size; goto err; } if (size > 0) { nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size, mtd_nvmem_user_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_user_nvmem = nvmem; } } if (mtd->_get_fact_prot_info && mtd->_read_fact_prot_reg) { size = mtd_otp_size(mtd, false); if (size < 0) { err = size; goto err; } if (size > 0) { /* * The factory OTP contains thing such as a unique serial * number and is small, so let's read it out and put it * into the entropy pool. */ void *otp; otp = kmalloc(size, GFP_KERNEL); if (!otp) { err = -ENOMEM; goto err; } err = mtd_nvmem_fact_otp_reg_read(mtd, 0, otp, size); if (err < 0) { kfree(otp); goto err; } add_device_randomness(otp, err); kfree(otp); nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size, mtd_nvmem_fact_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_factory_nvmem = nvmem; } } return 0; err: nvmem_unregister(mtd->otp_user_nvmem); /* Don't report error if OTP is not supported. */ if (err == -EOPNOTSUPP) return 0; return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n"); } /** * mtd_device_parse_register - parse partitions and register an MTD device. * * @mtd: the MTD device to register * @types: the list of MTD partition probes to try, see * 'parse_mtd_partitions()' for more information * @parser_data: MTD partition parser-specific data * @parts: fallback partition information to register, if parsing fails; * only valid if %nr_parts > %0 * @nr_parts: the number of partitions in parts, if zero then the full * MTD device is registered if no partition info is found * * This function aggregates MTD partitions parsing (done by * 'parse_mtd_partitions()') and MTD device and partitions registering. It * basically follows the most common pattern found in many MTD drivers: * * * If the MTD_PARTITIONED_MASTER option is set, then the device as a whole is * registered first. * * Then It tries to probe partitions on MTD device @mtd using parsers * specified in @types (if @types is %NULL, then the default list of parsers * is used, see 'parse_mtd_partitions()' for more information). If none are * found this functions tries to fallback to information specified in * @parts/@nr_parts. * * If no partitions were found this function just registers the MTD device * @mtd and exits. * * Returns zero in case of success and a negative error code in case of failure. */ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, struct mtd_part_parser_data *parser_data, const struct mtd_partition *parts, int nr_parts) { int ret, err; mtd_set_dev_defaults(mtd); ret = mtd_otp_nvmem_add(mtd); if (ret) goto out; if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { ret = add_mtd_device(mtd); if (ret) goto out; } /* Prefer parsed partitions over driver-provided fallback */ ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) goto out; if (ret > 0) ret = 0; else if (nr_parts) ret = add_mtd_partitions(mtd, parts, nr_parts); else if (!device_is_registered(&mtd->dev)) ret = add_mtd_device(mtd); else ret = 0; if (ret) goto out; /* * FIXME: some drivers unfortunately call this function more than once. * So we have to check if we've already assigned the reboot notifier. * * Generally, we can make multiple calls work for most cases, but it * does cause problems with parse_mtd_partitions() above (e.g., * cmdlineparts will register partitions more than once). */ WARN_ONCE(mtd->_reboot && mtd->reboot_notifier.notifier_call, "MTD already registered\n"); if (mtd->_reboot && !mtd->reboot_notifier.notifier_call) { mtd->reboot_notifier.notifier_call = mtd_reboot_notifier; register_reboot_notifier(&mtd->reboot_notifier); } out: if (ret) { nvmem_unregister(mtd->otp_user_nvmem); nvmem_unregister(mtd->otp_factory_nvmem); } if (ret && device_is_registered(&mtd->dev)) { err = del_mtd_device(mtd); if (err) pr_err("Error when deleting MTD device (%d)\n", err); } return ret; } EXPORT_SYMBOL_GPL(mtd_device_parse_register); /** * mtd_device_unregister - unregister an existing MTD device. * * @master: the MTD device to unregister. This will unregister both the master * and any partitions if registered. */ int mtd_device_unregister(struct mtd_info *master) { int err; if (master->_reboot) { unregister_reboot_notifier(&master->reboot_notifier); memset(&master->reboot_notifier, 0, sizeof(master->reboot_notifier)); } nvmem_unregister(master->otp_user_nvmem); nvmem_unregister(master->otp_factory_nvmem); err = del_mtd_partitions(master); if (err) return err; if (!device_is_registered(&master->dev)) return 0; return del_mtd_device(master); } EXPORT_SYMBOL_GPL(mtd_device_unregister); /** * register_mtd_user - register a 'user' of MTD devices. * @new: pointer to notifier info structure * * Registers a pair of callbacks function to be called upon addition * or removal of MTD devices. Causes the 'add' callback to be immediately * invoked for each MTD device currently present in the system. */ void register_mtd_user (struct mtd_notifier *new) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); list_add(&new->list, &mtd_notifiers); __module_get(THIS_MODULE); mtd_for_each_device(mtd) new->add(mtd); mutex_unlock(&mtd_table_mutex); } EXPORT_SYMBOL_GPL(register_mtd_user); /** * unregister_mtd_user - unregister a 'user' of MTD devices. * @old: pointer to notifier info structure * * Removes a callback function pair from the list of 'users' to be * notified upon addition or removal of MTD devices. Causes the * 'remove' callback to be immediately invoked for each MTD device * currently present in the system. */ int unregister_mtd_user (struct mtd_notifier *old) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); module_put(THIS_MODULE); mtd_for_each_device(mtd) old->remove(mtd); list_del(&old->list); mutex_unlock(&mtd_table_mutex); return 0; } EXPORT_SYMBOL_GPL(unregister_mtd_user); /** * get_mtd_device - obtain a validated handle for an MTD device * @mtd: last known address of the required MTD device * @num: internal device number of the required MTD device * * Given a number and NULL address, retu |